diff --git a/mindspore/lite/nnacl/assembly/avx/MatmulAvx.S b/mindspore/lite/nnacl/assembly/avx/MatmulAvx.S index 0adb95c3b7..9e39fe8fd5 100644 --- a/mindspore/lite/nnacl/assembly/avx/MatmulAvx.S +++ b/mindspore/lite/nnacl/assembly/avx/MatmulAvx.S @@ -840,7 +840,7 @@ LoopRow: vmovups %ymm12, (%rdx) addq %r12, %rdx vmovups %ymm14, (%rdx) - cmpq $-8, %rbx + cmpq $8, %rbx je WriteEnd movq %rax, %rdx addq %r13, %rax diff --git a/mindspore/lite/nnacl/fp32/pooling_fp32.c b/mindspore/lite/nnacl/fp32/pooling_fp32.c index a674e9e12c..9e74c81c98 100644 --- a/mindspore/lite/nnacl/fp32/pooling_fp32.c +++ b/mindspore/lite/nnacl/fp32/pooling_fp32.c @@ -17,6 +17,7 @@ #include "nnacl/fp32/pooling_fp32.h" #include #include "nnacl/errorcode.h" +#include "nnacl/op_base.h" int AvgPooling(const float *input_ptr, float *output_ptr, const PoolingParameter *pooling_param, int task_id, float minf, float maxf) { @@ -32,9 +33,9 @@ int AvgPooling(const float *input_ptr, float *output_ptr, const PoolingParameter int out_tile_count = UP_DIV(out_plane, TILE_NUM); int window = win_w * win_h; -#ifdef ENABLE_NEON - float32x4_t min_value = vdupq_n_f32(minf); - float32x4_t max_value = vdupq_n_f32(maxf); +#if defined(ENABLE_NEON) || defined(ENALBE_SSE) + MS_FLOAT32X4 min_value = MS_MOVQ_F32(minf); + MS_FLOAT32X4 max_value = MS_MOVQ_F32(maxf); #endif for (int batch = 0; batch < pooling_param->output_batch_; batch++) { @@ -61,8 +62,8 @@ int AvgPooling(const float *input_ptr, float *output_ptr, const PoolingParameter for (int ci = 0; ci < c4; ci++) { const float *src_c_ptr = src_plane_ptr + ci * C4NUM; float *dst_c_ptr = dst_plane_ptr + ci * C4NUM; -#ifdef ENABLE_NEON - float32x4_t tmp_avg = vdupq_n_f32(0); +#if defined(ENABLE_NEON) || defined(ENALBE_SSE) + MS_FLOAT32X4 tmp_avg = MS_MOVQ_F32(0); #else float tmp_avg1 = 0; float tmp_avg2 = 0; @@ -73,8 +74,8 @@ int AvgPooling(const float *input_ptr, float *output_ptr, const PoolingParameter for (int h = real_win_h_start; h < real_win_h_end; h++) { for (int w = real_win_w_start; w < real_win_w_end; w++) { const float *src_win_ptr = src_c_ptr + ((in_h_index + h) * in_w + in_w_index + w) * channel; -#ifdef ENABLE_NEON - tmp_avg = vaddq_f32(tmp_avg, vld1q_f32(src_win_ptr)); +#if defined(ENABLE_NEON) || defined(ENALBE_SSE) + tmp_avg = MS_ADDQ_F32(tmp_avg, MS_LDQ_F32(src_win_ptr)); #else tmp_avg1 += src_win_ptr[0]; tmp_avg2 += src_win_ptr[1]; @@ -90,11 +91,11 @@ int AvgPooling(const float *input_ptr, float *output_ptr, const PoolingParameter if (real_count == 0) { return NNACL_ERR; } -#ifdef ENABLE_NEON - tmp_avg = tmp_avg / vdupq_n_f32(real_count); - tmp_avg = vmaxq_f32(tmp_avg, min_value); - tmp_avg = vminq_f32(tmp_avg, max_value); - vst1q_f32(dst_c_ptr, tmp_avg); +#if defined(ENABLE_NEON) || defined(ENALBE_SSE) + tmp_avg = tmp_avg / MS_MOVQ_F32(real_count); + tmp_avg = MS_MAXQ_F32(tmp_avg, min_value); + tmp_avg = MS_MINQ_F32(tmp_avg, max_value); + MS_STQ_F32(dst_c_ptr, tmp_avg); #else tmp_avg1 /= (float)real_count; tmp_avg2 /= (float)real_count; @@ -158,9 +159,9 @@ void MaxPooling(const float *input_ptr, float *output_ptr, const PoolingParamete int out_tile_count = UP_DIV(out_plane, TILE_NUM); int c4 = channel / C4NUM; /* oc && ic */ -#ifdef ENABLE_NEON - float32x4_t min_value = vdupq_n_f32(minf); - float32x4_t max_value = vdupq_n_f32(maxf); +#if defined(ENABLE_NEON) || defined(ENALBE_SSE) + MS_FLOAT32X4 min_value = MS_MOVQ_F32(minf); + MS_FLOAT32X4 max_value = MS_MOVQ_F32(maxf); #endif for (int batch = 0; batch < output_batch; batch++) { @@ -187,8 +188,8 @@ void MaxPooling(const float 
*input_ptr, float *output_ptr, const PoolingParamete for (int ci = 0; ci < c4; ci++) { const float *src_c_ptr = src_plane_ptr + ci * C4NUM; float *dst_c_ptr = dst_plane_ptr + ci * C4NUM; -#ifdef ENABLE_NEON - float32x4_t tmp_max = vdupq_n_f32(-FLT_MAX); +#if defined(ENABLE_NEON) || defined(ENALBE_SSE) + MS_FLOAT32X4 tmp_max = MS_MOVQ_F32(-FLT_MAX); #else float tmp_max1 = -FLT_MAX; float tmp_max2 = -FLT_MAX; @@ -199,8 +200,8 @@ void MaxPooling(const float *input_ptr, float *output_ptr, const PoolingParamete for (int kh = real_win_h_start; kh < real_win_h_end; kh++) { for (int kw = real_win_w_start; kw < real_win_w_end; kw++) { const float *src_win_ptr = src_c_ptr + ((in_h_index + kh) * in_w + in_w_index + kw) * channel; -#ifdef ENABLE_NEON - tmp_max = vmaxq_f32(tmp_max, vld1q_f32(src_win_ptr)); +#if defined(ENABLE_NEON) || defined(ENALBE_SSE) + tmp_max = MS_MAXQ_F32(tmp_max, MS_LDQ_F32(src_win_ptr)); #else tmp_max1 = fmax(tmp_max1, src_win_ptr[0]); tmp_max2 = fmax(tmp_max2, src_win_ptr[1]); @@ -209,10 +210,10 @@ void MaxPooling(const float *input_ptr, float *output_ptr, const PoolingParamete #endif } // win_w loop } // win_h loop -#ifdef ENABLE_NEON - tmp_max = vmaxq_f32(tmp_max, min_value); - tmp_max = vminq_f32(tmp_max, max_value); - vst1q_f32(dst_c_ptr, tmp_max); +#if defined(ENABLE_NEON) || defined(ENALBE_SSE) + tmp_max = MS_MAXQ_F32(tmp_max, min_value); + tmp_max = MS_MINQ_F32(tmp_max, max_value); + MS_STQ_F32(dst_c_ptr, tmp_max); #else tmp_max1 = fmax(tmp_max1, minf); tmp_max2 = fmax(tmp_max2, minf); diff --git a/mindspore/lite/nnacl/winograd_transform.c b/mindspore/lite/nnacl/winograd_transform.c index 3abd21b58d..483755c808 100644 --- a/mindspore/lite/nnacl/winograd_transform.c +++ b/mindspore/lite/nnacl/winograd_transform.c @@ -15,6 +15,7 @@ */ #include "nnacl/winograd_transform.h" +#include "nnacl/op_base.h" // fp32 conv winograd void WinogradInputTransform(const float *input_data, float *trans_input, float *tmp_data, int cal_num, @@ -61,8 +62,8 @@ void WinogradInputTransform(const float *input_data, float *trans_input, float * int dst_x_offset = dst_y_offset + j * C4NUM; float *src_addr = (float *)(input_data) + src_x_offset; float *dst_addr = tmp_data + dst_x_offset; -#ifdef ENABLE_NEON - vst1q_f32(dst_addr, vld1q_f32(src_addr)); +#if defined(ENABLE_ARM) || defined(ENABLE_SSE) + MS_STQ_F32(dst_addr, MS_LDQ_F32(src_addr)); #else for (int k = 0; k < C4NUM; k++) { dst_addr[k] = src_addr[k]; diff --git a/mindspore/lite/nnacl/winograd_utils.c b/mindspore/lite/nnacl/winograd_utils.c index 28f89e48fa..ed95361a4c 100644 --- a/mindspore/lite/nnacl/winograd_utils.c +++ b/mindspore/lite/nnacl/winograd_utils.c @@ -118,21 +118,21 @@ void GeneralOutputTransformUnit(const float *src_data, float *dst_data, const fl if (src_len > MAX_LEN) { return; } -#ifdef ENABLE_ARM - float32x4_t src[MAX_LEN]; - float32x4_t t[MAX_LEN]; - float32x4_t m[MAX_LEN]; - float32x4_t vec_a[MAX_LEN]; - float32x4_t vec_at[MAX_LEN]; +#if defined(ENABLE_ARM) || defined(ENABLE_SSE) + MS_FLOAT32X4 src[MAX_LEN]; + MS_FLOAT32X4 t[MAX_LEN]; + MS_FLOAT32X4 m[MAX_LEN]; + MS_FLOAT32X4 vec_a[MAX_LEN]; + MS_FLOAT32X4 vec_at[MAX_LEN]; int tmp_len = in_unit * out_unit; if (tmp_len > MAX_LEN) return; for (int i = 0; i < tmp_len; i++) { - vec_a[i] = vdupq_n_f32(matrix_a[i]); - vec_at[i] = vdupq_n_f32(matrix_at[i]); + vec_a[i] = MS_MOVQ_F32(matrix_a[i]); + vec_at[i] = MS_MOVQ_F32(matrix_at[i]); } for (int i = 0; i < src_len; i++) { - src[i] = vld1q_f32(src_data + i * src_step); + src[i] = MS_LDQ_F32(src_data + i * src_step); } 
MatrixMultiplyVec(vec_at, src, t, NULL, out_unit, in_unit, in_unit); MatrixMultiplyVec(t, vec_a, m, bias_data, out_unit, in_unit, out_unit); @@ -141,7 +141,7 @@ void GeneralOutputTransformUnit(const float *src_data, float *dst_data, const fl int dst_k_offset = i * dst_step * C4NUM; int m_k_offset = i * out_unit; for (int j = 0; j < out_unit; j++) { - vst1q_f32(dst_data + dst_k_offset + j * C4NUM, m[m_k_offset + j]); + MS_STQ_F32(dst_data + dst_k_offset + j * C4NUM, m[m_k_offset + j]); } } #else @@ -172,28 +172,28 @@ void GeneralOutputTransformUnit(const float *src_data, float *dst_data, const fl InputTransFunc GetInputTransFunc(int input_unit) { return InputTransFuncList[input_unit]; } void InputTransform4x4Unit(const float *src_data, float *dst_data, int src_step, int dst_step, int real_c) { -#ifdef ENABLE_ARM +#if defined(ENABLE_ARM) || defined(ENABLE_SSE) if (real_c == 4) { - float32x4_t src[16]; - float32x4_t t[16]; - float32x4_t m[16]; + MS_FLOAT32X4 src[16]; + MS_FLOAT32X4 t[16]; + MS_FLOAT32X4 m[16]; Load16Data; for (int l = 0; l < 4; ++l) { int offset = l * 4; - t[l] = vsubq_f32(src[offset], src[2 + offset]); - t[4 + l] = vaddq_f32(src[1 + offset], src[2 + offset]); - t[8 + l] = vsubq_f32(src[2 + offset], src[1 + offset]); - t[12 + l] = vsubq_f32(src[3 + offset], src[1 + offset]); + t[l] = MS_SUBQ_F32(src[offset], src[2 + offset]); + t[4 + l] = MS_ADDQ_F32(src[1 + offset], src[2 + offset]); + t[8 + l] = MS_SUBQ_F32(src[2 + offset], src[1 + offset]); + t[12 + l] = MS_SUBQ_F32(src[3 + offset], src[1 + offset]); } for (int l = 0; l < 4; ++l) { int offset = l * 4; - m[l] = vsubq_f32(t[offset], t[2 + offset]); - m[4 + l] = vaddq_f32(t[1 + offset], t[2 + offset]); - m[8 + l] = vsubq_f32(t[2 + offset], t[1 + offset]); - m[12 + l] = vsubq_f32(t[3 + offset], t[1 + offset]); + m[l] = MS_SUBQ_F32(t[offset], t[2 + offset]); + m[4 + l] = MS_ADDQ_F32(t[1 + offset], t[2 + offset]); + m[8 + l] = MS_SUBQ_F32(t[2 + offset], t[1 + offset]); + m[12 + l] = MS_SUBQ_F32(t[3 + offset], t[1 + offset]); } for (int i = 0; i < 16; i++) { - vst1q_f32(dst_data + i * dst_step, m[i]); + MS_STQ_F32(dst_data + i * dst_step, m[i]); } } else { #endif @@ -222,47 +222,47 @@ void InputTransform4x4Unit(const float *src_data, float *dst_data, int src_step, dst_data[i + k * dst_step] = m[k]; } } -#ifdef ENABLE_ARM +#if defined(ENABLE_ARM) || defined(ENABLE_SSE) } #endif } void InputTransform6x6Unit(const float *src_data, float *dst_data, int src_step, int dst_step, int real_c) { -#ifdef ENABLE_ARM +#if defined(ENABLE_ARM) || defined(ENABLE_SSE) if (real_c == 4) { - float32x4_t src[36]; - float32x4_t t[36]; - float32x4_t m[36]; + MS_FLOAT32X4 src[36]; + MS_FLOAT32X4 t[36]; + MS_FLOAT32X4 m[36]; Load36Data; for (int l = 0; l < 6; ++l) { int offset = l * 6; - float32x4_t tmp1 = vsubq_f32(src[3 + offset], src[1 + offset]); - float32x4_t tmp2 = vsubq_f32(src[4 + offset], src[2 + offset]); - t[l] = vaddq_f32(vsubq_f32(vmulq_n_f32(src[offset], 4), vmulq_n_f32(src[2 + offset], 5)), src[4 + offset]); - t[6 + l] = vaddq_f32(vmulq_n_f32(vaddq_f32(src[1 + offset], src[2 + offset]), -4), - vaddq_f32(src[3 + offset], src[4 + offset])); - t[12 + l] = vaddq_f32(vmulq_n_f32(vsubq_f32(src[1 + offset], src[2 + offset]), 4), - vsubq_f32(src[4 + offset], src[3 + offset])); - t[18 + l] = vaddq_f32(vmulq_n_f32(tmp1, 2), tmp2); - t[24 + l] = vaddq_f32(vmulq_n_f32(tmp1, -2), tmp2); + MS_FLOAT32X4 tmp1 = MS_SUBQ_F32(src[3 + offset], src[1 + offset]); + MS_FLOAT32X4 tmp2 = MS_SUBQ_F32(src[4 + offset], src[2 + offset]); + t[l] = 
MS_ADDQ_F32(MS_SUBQ_F32(MS_MULQ_F32(src[offset], 4), MS_MULQ_F32(src[2 + offset], 5)), src[4 + offset]); + t[6 + l] = MS_ADDQ_F32(MS_MULQ_F32(MS_ADDQ_F32(src[1 + offset], src[2 + offset]), -4), + MS_ADDQ_F32(src[3 + offset], src[4 + offset])); + t[12 + l] = MS_ADDQ_F32(MS_MULQ_F32(MS_SUBQ_F32(src[1 + offset], src[2 + offset]), 4), + MS_SUBQ_F32(src[4 + offset], src[3 + offset])); + t[18 + l] = MS_ADDQ_F32(MS_MULQ_F32(tmp1, 2), tmp2); + t[24 + l] = MS_ADDQ_F32(MS_MULQ_F32(tmp1, -2), tmp2); t[30 + l] = - vaddq_f32(vsubq_f32(vmulq_n_f32(src[1 + offset], 4), vmulq_n_f32(src[3 + offset], 5)), src[5 + offset]); + MS_ADDQ_F32(MS_SUBQ_F32(MS_MULQ_F32(src[1 + offset], 4), MS_MULQ_F32(src[3 + offset], 5)), src[5 + offset]); } for (int l = 0; l < 6; ++l) { int offset = l * 6; - float32x4_t tmp1 = vsubq_f32(t[3 + offset], t[1 + offset]); - float32x4_t tmp2 = vsubq_f32(t[4 + offset], t[2 + offset]); - m[l] = vaddq_f32(vsubq_f32(vmulq_n_f32(t[offset], 4), vmulq_n_f32(t[2 + offset], 5)), t[4 + offset]); - m[6 + l] = - vaddq_f32(vmulq_n_f32(vaddq_f32(t[1 + offset], t[2 + offset]), -4), vaddq_f32(t[3 + offset], t[4 + offset])); - m[12 + l] = - vaddq_f32(vmulq_n_f32(vsubq_f32(t[1 + offset], t[2 + offset]), 4), vsubq_f32(t[4 + offset], t[3 + offset])); - m[18 + l] = vaddq_f32(vmulq_n_f32(tmp1, 2), tmp2); - m[24 + l] = vaddq_f32(vmulq_n_f32(tmp1, -2), tmp2); - m[30 + l] = vaddq_f32(vsubq_f32(vmulq_n_f32(t[1 + offset], 4), vmulq_n_f32(t[3 + offset], 5)), t[5 + offset]); + MS_FLOAT32X4 tmp1 = MS_SUBQ_F32(t[3 + offset], t[1 + offset]); + MS_FLOAT32X4 tmp2 = MS_SUBQ_F32(t[4 + offset], t[2 + offset]); + m[l] = MS_ADDQ_F32(MS_SUBQ_F32(MS_MULQ_F32(t[offset], 4), MS_MULQ_F32(t[2 + offset], 5)), t[4 + offset]); + m[6 + l] = MS_ADDQ_F32(MS_MULQ_F32(MS_ADDQ_F32(t[1 + offset], t[2 + offset]), -4), + MS_ADDQ_F32(t[3 + offset], t[4 + offset])); + m[12 + l] = MS_ADDQ_F32(MS_MULQ_F32(MS_SUBQ_F32(t[1 + offset], t[2 + offset]), 4), + MS_SUBQ_F32(t[4 + offset], t[3 + offset])); + m[18 + l] = MS_ADDQ_F32(MS_MULQ_F32(tmp1, 2), tmp2); + m[24 + l] = MS_ADDQ_F32(MS_MULQ_F32(tmp1, -2), tmp2); + m[30 + l] = MS_ADDQ_F32(MS_SUBQ_F32(MS_MULQ_F32(t[1 + offset], 4), MS_MULQ_F32(t[3 + offset], 5)), t[5 + offset]); } for (int i = 0; i < 36; i++) { - vst1q_f32(dst_data + i * dst_step, m[i]); + MS_STQ_F32(dst_data + i * dst_step, m[i]); } } else { #endif @@ -299,64 +299,69 @@ void InputTransform6x6Unit(const float *src_data, float *dst_data, int src_step, dst_data[i + k * dst_step] = m[k]; } } -#ifdef ENABLE_ARM +#if defined(ENABLE_ARM) || defined(ENABLE_SSE) } #endif } void InputTransform8x8Unit(const float *src_data, float *dst_data, int src_step, int dst_step, int real_c) { -#ifdef ENABLE_ARM +#if defined(ENABLE_ARM) || defined(ENABLE_SSE) if (real_c == 4) { - float32x4_t src[64]; - float32x4_t t[64]; - float32x4_t m[64]; + MS_FLOAT32X4 src[64]; + MS_FLOAT32X4 t[64]; + MS_FLOAT32X4 m[64]; Load64Data; for (int l = 0; l < 8; ++l) { int offset = l * 8; - t[l] = vsubq_f32(vaddq_f32(vsubq_f32(vmulq_n_f32(src[offset], 0.5625), vmulq_n_f32(src[2 + offset], 3.0625)), - vmulq_n_f32(src[4 + offset], 3.5)), - src[6 + offset]); - float32x4_t tmp1 = vaddq_f32(vmulq_n_f32(src[1 + offset], 1.125), vmulq_n_f32(src[5 + offset], 0.5)); - float32x4_t tmp2 = vsubq_f32(vmulq_n_f32(src[2 + offset], 2.25), vmulq_n_f32(src[4 + offset], 3.25)); - t[8 + l] = vaddq_f32(vsubq_f32(vaddq_f32(tmp1, tmp2), vmulq_n_f32(src[3 + offset], 1.625)), src[6 + offset]); - t[16 + l] = vaddq_f32(vaddq_f32(vsubq_f32(tmp2, tmp1), vmulq_n_f32(src[3 + offset], 1.625)), src[6 + offset]); 
- tmp1 = vaddq_f32(vmulq_n_f32(src[1 + offset], 0.5625), src[5 + offset]); - tmp2 = vsubq_f32(vmulq_n_f32(src[2 + offset], 0.5625), vmulq_n_f32(src[4 + offset], 2.5)); - t[24 + l] = vaddq_f32(vsubq_f32(vaddq_f32(tmp1, tmp2), vmulq_n_f32(src[3 + offset], 2.5)), src[6 + offset]); - t[32 + l] = vaddq_f32(vaddq_f32(vsubq_f32(tmp2, tmp1), vmulq_n_f32(src[3 + offset], 2.5)), src[6 + offset]); - tmp1 = vaddq_f32(vmulq_n_f32(src[1 + offset], 0.375), vmulq_n_f32(src[5 + offset], 1.5)); - tmp2 = vsubq_f32(vmulq_n_f32(src[2 + offset], 0.25), vmulq_n_f32(src[4 + offset], 1.25)); - t[40 + l] = vaddq_f32(vsubq_f32(vaddq_f32(tmp1, tmp2), vmulq_n_f32(src[3 + offset], 1.875)), src[6 + offset]); - t[48 + l] = vaddq_f32(vaddq_f32(vsubq_f32(tmp2, tmp1), vmulq_n_f32(src[3 + offset], 1.875)), src[6 + offset]); - t[56 + l] = - vaddq_f32(vsubq_f32(vaddq_f32(vmulq_n_f32(src[1 + offset], -0.5625), vmulq_n_f32(src[3 + offset], 3.0625)), - vmulq_n_f32(src[5 + offset], 3.5)), - src[7 + offset]); + t[l] = + MS_SUBQ_F32(MS_ADDQ_F32(MS_SUBQ_F32(MS_MULQ_F32(src[offset], 0.5625), MS_MULQ_F32(src[2 + offset], 3.0625)), + MS_MULQ_F32(src[4 + offset], 3.5)), + src[6 + offset]); + MS_FLOAT32X4 tmp1 = MS_ADDQ_F32(MS_MULQ_F32(src[1 + offset], 1.125), MS_MULQ_F32(src[5 + offset], 0.5)); + MS_FLOAT32X4 tmp2 = MS_SUBQ_F32(MS_MULQ_F32(src[2 + offset], 2.25), MS_MULQ_F32(src[4 + offset], 3.25)); + t[8 + l] = + MS_ADDQ_F32(MS_SUBQ_F32(MS_ADDQ_F32(tmp1, tmp2), MS_MULQ_F32(src[3 + offset], 1.625)), src[6 + offset]); + t[16 + l] = + MS_ADDQ_F32(MS_ADDQ_F32(MS_SUBQ_F32(tmp2, tmp1), MS_MULQ_F32(src[3 + offset], 1.625)), src[6 + offset]); + tmp1 = MS_ADDQ_F32(MS_MULQ_F32(src[1 + offset], 0.5625), src[5 + offset]); + tmp2 = MS_SUBQ_F32(MS_MULQ_F32(src[2 + offset], 0.5625), MS_MULQ_F32(src[4 + offset], 2.5)); + t[24 + l] = MS_ADDQ_F32(MS_SUBQ_F32(MS_ADDQ_F32(tmp1, tmp2), MS_MULQ_F32(src[3 + offset], 2.5)), src[6 + offset]); + t[32 + l] = MS_ADDQ_F32(MS_ADDQ_F32(MS_SUBQ_F32(tmp2, tmp1), MS_MULQ_F32(src[3 + offset], 2.5)), src[6 + offset]); + tmp1 = MS_ADDQ_F32(MS_MULQ_F32(src[1 + offset], 0.375), MS_MULQ_F32(src[5 + offset], 1.5)); + tmp2 = MS_SUBQ_F32(MS_MULQ_F32(src[2 + offset], 0.25), MS_MULQ_F32(src[4 + offset], 1.25)); + t[40 + l] = + MS_ADDQ_F32(MS_SUBQ_F32(MS_ADDQ_F32(tmp1, tmp2), MS_MULQ_F32(src[3 + offset], 1.875)), src[6 + offset]); + t[48 + l] = + MS_ADDQ_F32(MS_ADDQ_F32(MS_SUBQ_F32(tmp2, tmp1), MS_MULQ_F32(src[3 + offset], 1.875)), src[6 + offset]); + t[56 + l] = MS_ADDQ_F32( + MS_SUBQ_F32(MS_ADDQ_F32(MS_MULQ_F32(src[1 + offset], -0.5625), MS_MULQ_F32(src[3 + offset], 3.0625)), + MS_MULQ_F32(src[5 + offset], 3.5)), + src[7 + offset]); } for (int l = 0; l < 8; ++l) { int offset = l * 8; - m[l] = vsubq_f32(vaddq_f32(vsubq_f32(vmulq_n_f32(t[offset], 0.5625), vmulq_n_f32(t[2 + offset], 3.0625)), - vmulq_n_f32(t[4 + offset], 3.5)), - t[6 + offset]); - float32x4_t tmp1 = vaddq_f32(vmulq_n_f32(t[1 + offset], 1.125), vmulq_n_f32(t[5 + offset], 0.5)); - float32x4_t tmp2 = vsubq_f32(vmulq_n_f32(t[2 + offset], 2.25), vmulq_n_f32(t[4 + offset], 3.25)); - m[8 + l] = vaddq_f32(vsubq_f32(vaddq_f32(tmp1, tmp2), vmulq_n_f32(t[3 + offset], 1.625)), t[6 + offset]); - m[16 + l] = vaddq_f32(vaddq_f32(vsubq_f32(tmp2, tmp1), vmulq_n_f32(t[3 + offset], 1.625)), t[6 + offset]); - tmp1 = vaddq_f32(vmulq_n_f32(t[1 + offset], 0.5625), t[5 + offset]); - tmp2 = vsubq_f32(vmulq_n_f32(t[2 + offset], 0.5625), vmulq_n_f32(t[4 + offset], 2.5)); - m[24 + l] = vaddq_f32(vsubq_f32(vaddq_f32(tmp1, tmp2), vmulq_n_f32(t[3 + offset], 2.5)), t[6 + offset]); - m[32 + l] = 
vaddq_f32(vaddq_f32(vsubq_f32(tmp2, tmp1), vmulq_n_f32(t[3 + offset], 2.5)), t[6 + offset]); - tmp1 = vaddq_f32(vmulq_n_f32(t[1 + offset], 0.375), vmulq_n_f32(t[5 + offset], 1.5)); - tmp2 = vsubq_f32(vmulq_n_f32(t[2 + offset], 0.25), vmulq_n_f32(t[4 + offset], 1.25)); - m[40 + l] = vaddq_f32(vsubq_f32(vaddq_f32(tmp1, tmp2), vmulq_n_f32(t[3 + offset], 1.875)), t[6 + offset]); - m[48 + l] = vaddq_f32(vaddq_f32(vsubq_f32(tmp2, tmp1), vmulq_n_f32(t[3 + offset], 1.875)), t[6 + offset]); + m[l] = MS_SUBQ_F32(MS_ADDQ_F32(MS_SUBQ_F32(MS_MULQ_F32(t[offset], 0.5625), MS_MULQ_F32(t[2 + offset], 3.0625)), + MS_MULQ_F32(t[4 + offset], 3.5)), + t[6 + offset]); + MS_FLOAT32X4 tmp1 = MS_ADDQ_F32(MS_MULQ_F32(t[1 + offset], 1.125), MS_MULQ_F32(t[5 + offset], 0.5)); + MS_FLOAT32X4 tmp2 = MS_SUBQ_F32(MS_MULQ_F32(t[2 + offset], 2.25), MS_MULQ_F32(t[4 + offset], 3.25)); + m[8 + l] = MS_ADDQ_F32(MS_SUBQ_F32(MS_ADDQ_F32(tmp1, tmp2), MS_MULQ_F32(t[3 + offset], 1.625)), t[6 + offset]); + m[16 + l] = MS_ADDQ_F32(MS_ADDQ_F32(MS_SUBQ_F32(tmp2, tmp1), MS_MULQ_F32(t[3 + offset], 1.625)), t[6 + offset]); + tmp1 = MS_ADDQ_F32(MS_MULQ_F32(t[1 + offset], 0.5625), t[5 + offset]); + tmp2 = MS_SUBQ_F32(MS_MULQ_F32(t[2 + offset], 0.5625), MS_MULQ_F32(t[4 + offset], 2.5)); + m[24 + l] = MS_ADDQ_F32(MS_SUBQ_F32(MS_ADDQ_F32(tmp1, tmp2), MS_MULQ_F32(t[3 + offset], 2.5)), t[6 + offset]); + m[32 + l] = MS_ADDQ_F32(MS_ADDQ_F32(MS_SUBQ_F32(tmp2, tmp1), MS_MULQ_F32(t[3 + offset], 2.5)), t[6 + offset]); + tmp1 = MS_ADDQ_F32(MS_MULQ_F32(t[1 + offset], 0.375), MS_MULQ_F32(t[5 + offset], 1.5)); + tmp2 = MS_SUBQ_F32(MS_MULQ_F32(t[2 + offset], 0.25), MS_MULQ_F32(t[4 + offset], 1.25)); + m[40 + l] = MS_ADDQ_F32(MS_SUBQ_F32(MS_ADDQ_F32(tmp1, tmp2), MS_MULQ_F32(t[3 + offset], 1.875)), t[6 + offset]); + m[48 + l] = MS_ADDQ_F32(MS_ADDQ_F32(MS_SUBQ_F32(tmp2, tmp1), MS_MULQ_F32(t[3 + offset], 1.875)), t[6 + offset]); m[56 + l] = - vaddq_f32(vsubq_f32(vaddq_f32(vmulq_n_f32(t[1 + offset], -0.5625), vmulq_n_f32(t[3 + offset], 3.0625)), - vmulq_n_f32(t[5 + offset], 3.5)), - t[7 + offset]); + MS_ADDQ_F32(MS_SUBQ_F32(MS_ADDQ_F32(MS_MULQ_F32(t[1 + offset], -0.5625), MS_MULQ_F32(t[3 + offset], 3.0625)), + MS_MULQ_F32(t[5 + offset], 3.5)), + t[7 + offset]); } for (int i = 0; i < 64; i++) { - vst1q_f32(dst_data + i * dst_step, m[i]); + MS_STQ_F32(dst_data + i * dst_step, m[i]); } } else { #endif @@ -405,7 +410,7 @@ void InputTransform8x8Unit(const float *src_data, float *dst_data, int src_step, dst_data[i + k * dst_step] = m[k]; } } -#ifdef ENABLE_ARM +#if defined(ENABLE_ARM) || defined(ENABLE_SSE) } #endif } @@ -442,21 +447,21 @@ OutputTransFunc GetOutputTransFunc(int input_unit, int output_unit, ActType act_ void OutputTransform4x2Unit(const float *src_data, float *dst_data, const float *bias_data, int src_step, int dst_step, int out_c, int r_w, int r_h, int r_c) { -#ifdef ENABLE_ARM - float32x4_t src[16]; - float32x4_t t[8]; - float32x4_t m[4]; +#if defined(ENABLE_ARM) || defined(ENABLE_SSE) + MS_FLOAT32X4 src[16]; + MS_FLOAT32X4 t[8]; + MS_FLOAT32X4 m[4]; Load16Data; - float32x4_t bias_ptr = vld1q_f32(bias_data); + MS_FLOAT32X4 bias_ptr = MS_LDQ_F32(bias_data); for (int l = 0; l < 4; ++l) { int offset = l * 4; - t[l] = vaddq_f32(vaddq_f32(src[offset], src[1 + offset]), src[2 + offset]); - t[l + 4] = vaddq_f32(vsubq_f32(src[1 + offset], src[2 + offset]), src[3 + offset]); + t[l] = MS_ADDQ_F32(MS_ADDQ_F32(src[offset], src[1 + offset]), src[2 + offset]); + t[l + 4] = MS_ADDQ_F32(MS_SUBQ_F32(src[1 + offset], src[2 + offset]), src[3 + offset]); } for (int l = 
0; l < 2; ++l) { int offset = l * 4; - m[l] = vaddq_f32(vaddq_f32(vaddq_f32(t[offset], t[1 + offset]), t[2 + offset]), bias_ptr); - m[l + 2] = vaddq_f32(vaddq_f32(vsubq_f32(t[1 + offset], t[2 + offset]), t[3 + offset]), bias_ptr); + m[l] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(t[offset], t[1 + offset]), t[2 + offset]), bias_ptr); + m[l + 2] = MS_ADDQ_F32(MS_ADDQ_F32(MS_SUBQ_F32(t[1 + offset], t[2 + offset]), t[3 + offset]), bias_ptr); } if (r_c == C4NUM && r_h == 2 && r_w == 2) { Store4Data; @@ -504,24 +509,24 @@ void OutputTransform4x2Unit(const float *src_data, float *dst_data, const float void OutputTransform4x2ReluUnit(const float *src_data, float *dst_data, const float *bias_data, int src_step, int dst_step, int out_c, int r_w, int r_h, int r_c) { -#ifdef ENABLE_ARM - float32x4_t src[16]; - float32x4_t t[8]; - float32x4_t m[4]; - float32x4_t zero = vdupq_n_f32(0); +#if defined(ENABLE_ARM) || defined(ENABLE_SSE) + MS_FLOAT32X4 src[16]; + MS_FLOAT32X4 t[8]; + MS_FLOAT32X4 m[4]; + MS_FLOAT32X4 zero = MS_MOVQ_F32(0); Load16Data; - float32x4_t bias_ptr = vld1q_f32(bias_data); + MS_FLOAT32X4 bias_ptr = MS_LDQ_F32(bias_data); for (int l = 0; l < 4; ++l) { int offset = l * 4; - t[l] = vaddq_f32(vaddq_f32(src[offset], src[1 + offset]), src[2 + offset]); - t[l + 4] = vaddq_f32(vsubq_f32(src[1 + offset], src[2 + offset]), src[3 + offset]); + t[l] = MS_ADDQ_F32(MS_ADDQ_F32(src[offset], src[1 + offset]), src[2 + offset]); + t[l + 4] = MS_ADDQ_F32(MS_SUBQ_F32(src[1 + offset], src[2 + offset]), src[3 + offset]); } for (int l = 0; l < 2; ++l) { int offset = l * 4; - m[l] = vaddq_f32(vaddq_f32(vaddq_f32(t[offset], t[1 + offset]), t[2 + offset]), bias_ptr); - m[l + 2] = vaddq_f32(vaddq_f32(vsubq_f32(t[1 + offset], t[2 + offset]), t[3 + offset]), bias_ptr); - m[l] = vmaxq_f32(zero, m[l]); - m[l + 2] = vmaxq_f32(zero, m[l + 2]); + m[l] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(t[offset], t[1 + offset]), t[2 + offset]), bias_ptr); + m[l + 2] = MS_ADDQ_F32(MS_ADDQ_F32(MS_SUBQ_F32(t[1 + offset], t[2 + offset]), t[3 + offset]), bias_ptr); + m[l] = MS_MAXQ_F32(zero, m[l]); + m[l + 2] = MS_MAXQ_F32(zero, m[l + 2]); } if (r_c == C4NUM && r_h == 2 && r_w == 2) { Store4Data; @@ -571,27 +576,27 @@ void OutputTransform4x2ReluUnit(const float *src_data, float *dst_data, const fl void OutputTransform4x2Relu6Unit(const float *src_data, float *dst_data, const float *bias_data, int src_step, int dst_step, int out_c, int r_w, int r_h, int r_c) { -#ifdef ENABLE_ARM - float32x4_t src[16]; - float32x4_t t[8]; - float32x4_t m[4]; - float32x4_t zero = vdupq_n_f32(0); - float32x4_t six = vdupq_n_f32(6); +#if defined(ENABLE_ARM) || defined(ENABLE_SSE) + MS_FLOAT32X4 src[16]; + MS_FLOAT32X4 t[8]; + MS_FLOAT32X4 m[4]; + MS_FLOAT32X4 zero = MS_MOVQ_F32(0); + MS_FLOAT32X4 six = MS_MOVQ_F32(6); Load16Data; - float32x4_t bias_ptr = vld1q_f32(bias_data); + MS_FLOAT32X4 bias_ptr = MS_LDQ_F32(bias_data); for (int l = 0; l < 4; ++l) { int offset = l * 4; - t[l] = vaddq_f32(vaddq_f32(src[offset], src[1 + offset]), src[2 + offset]); - t[l + 4] = vaddq_f32(vsubq_f32(src[1 + offset], src[2 + offset]), src[3 + offset]); + t[l] = MS_ADDQ_F32(MS_ADDQ_F32(src[offset], src[1 + offset]), src[2 + offset]); + t[l + 4] = MS_ADDQ_F32(MS_SUBQ_F32(src[1 + offset], src[2 + offset]), src[3 + offset]); } for (int l = 0; l < 2; ++l) { int offset = l * 4; - m[l] = vaddq_f32(vaddq_f32(vaddq_f32(t[offset], t[1 + offset]), t[2 + offset]), bias_ptr); - m[l + 2] = vaddq_f32(vaddq_f32(vsubq_f32(t[1 + offset], t[2 + offset]), t[3 + offset]), bias_ptr); - m[l] = 
vmaxq_f32(zero, m[l]); - m[l] = vminq_f32(six, m[l]); - m[l + 2] = vmaxq_f32(zero, m[l + 2]); - m[l + 2] = vminq_f32(six, m[l + 2]); + m[l] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(t[offset], t[1 + offset]), t[2 + offset]), bias_ptr); + m[l + 2] = MS_ADDQ_F32(MS_ADDQ_F32(MS_SUBQ_F32(t[1 + offset], t[2 + offset]), t[3 + offset]), bias_ptr); + m[l] = MS_MAXQ_F32(zero, m[l]); + m[l] = MS_MINQ_F32(six, m[l]); + m[l + 2] = MS_MAXQ_F32(zero, m[l + 2]); + m[l + 2] = MS_MINQ_F32(six, m[l + 2]); } if (r_c == C4NUM && r_h == 2 && r_w == 2) { Store4Data; @@ -642,25 +647,25 @@ void OutputTransform4x2Relu6Unit(const float *src_data, float *dst_data, const f void OutputTransform4x3Unit(const float *src_data, float *dst_data, const float *bias_data, int src_step, int dst_step, int out_c, int r_w, int r_h, int r_c) { -#ifdef ENABLE_ARM - float32x4_t src[16]; - float32x4_t t[12]; - float32x4_t m[9]; +#if defined(ENABLE_ARM) || defined(ENABLE_SSE) + MS_FLOAT32X4 src[16]; + MS_FLOAT32X4 t[12]; + MS_FLOAT32X4 m[9]; Load16Data; - float32x4_t bias_ptr = vld1q_f32(bias_data); + MS_FLOAT32X4 bias_ptr = MS_LDQ_F32(bias_data); for (int l = 0; l < 4; ++l) { int offset = l * 4; - float32x4_t tmp = vaddq_f32(src[1 + offset], src[2 + offset]); - t[l] = vaddq_f32(src[offset], tmp); - t[l + 4] = vsubq_f32(src[1 + offset], src[2 + offset]); - t[l + 8] = vaddq_f32(tmp, src[3 + offset]); + MS_FLOAT32X4 tmp = MS_ADDQ_F32(src[1 + offset], src[2 + offset]); + t[l] = MS_ADDQ_F32(src[offset], tmp); + t[l + 4] = MS_SUBQ_F32(src[1 + offset], src[2 + offset]); + t[l + 8] = MS_ADDQ_F32(tmp, src[3 + offset]); } for (int l = 0; l < 3; ++l) { int offset = l * 4; - float32x4_t tmp = vaddq_f32(t[1 + offset], t[2 + offset]); - m[l] = vaddq_f32(vaddq_f32(t[offset], tmp), bias_ptr); - m[l + 3] = vaddq_f32(vsubq_f32(t[1 + offset], t[2 + offset]), bias_ptr); - m[l + 6] = vaddq_f32(vaddq_f32(tmp, t[3 + offset]), bias_ptr); + MS_FLOAT32X4 tmp = MS_ADDQ_F32(t[1 + offset], t[2 + offset]); + m[l] = MS_ADDQ_F32(MS_ADDQ_F32(t[offset], tmp), bias_ptr); + m[l + 3] = MS_ADDQ_F32(MS_SUBQ_F32(t[1 + offset], t[2 + offset]), bias_ptr); + m[l + 6] = MS_ADDQ_F32(MS_ADDQ_F32(tmp, t[3 + offset]), bias_ptr); } if (r_c == C4NUM && r_h == 3 && r_w == 3) { Store9Data; @@ -710,29 +715,29 @@ void OutputTransform4x3Unit(const float *src_data, float *dst_data, const float void OutputTransform4x3ReluUnit(const float *src_data, float *dst_data, const float *bias_data, int src_step, int dst_step, int out_c, int r_w, int r_h, int r_c) { -#ifdef ENABLE_ARM - float32x4_t src[16]; - float32x4_t t[12]; - float32x4_t m[9]; - float32x4_t zero = vdupq_n_f32(0); +#if defined(ENABLE_ARM) || defined(ENABLE_SSE) + MS_FLOAT32X4 src[16]; + MS_FLOAT32X4 t[12]; + MS_FLOAT32X4 m[9]; + MS_FLOAT32X4 zero = MS_MOVQ_F32(0); Load16Data; - float32x4_t bias_ptr = vld1q_f32(bias_data); + MS_FLOAT32X4 bias_ptr = MS_LDQ_F32(bias_data); for (int l = 0; l < 4; ++l) { int offset = l * 4; - float32x4_t tmp = vaddq_f32(src[1 + offset], src[2 + offset]); - t[l] = vaddq_f32(src[offset], tmp); - t[l + 4] = vsubq_f32(src[1 + offset], src[2 + offset]); - t[l + 8] = vaddq_f32(tmp, src[3 + offset]); + MS_FLOAT32X4 tmp = MS_ADDQ_F32(src[1 + offset], src[2 + offset]); + t[l] = MS_ADDQ_F32(src[offset], tmp); + t[l + 4] = MS_SUBQ_F32(src[1 + offset], src[2 + offset]); + t[l + 8] = MS_ADDQ_F32(tmp, src[3 + offset]); } for (int l = 0; l < 3; ++l) { int offset = l * 4; - float32x4_t tmp = vaddq_f32(t[1 + offset], t[2 + offset]); - m[l] = vaddq_f32(vaddq_f32(t[offset], tmp), bias_ptr); - m[l + 3] = 
vaddq_f32(vsubq_f32(t[1 + offset], t[2 + offset]), bias_ptr); - m[l + 6] = vaddq_f32(vaddq_f32(tmp, t[3 + offset]), bias_ptr); - m[l] = vmaxq_f32(zero, m[l]); - m[l + 3] = vmaxq_f32(zero, m[l + 3]); - m[l + 6] = vmaxq_f32(zero, m[l + 6]); + MS_FLOAT32X4 tmp = MS_ADDQ_F32(t[1 + offset], t[2 + offset]); + m[l] = MS_ADDQ_F32(MS_ADDQ_F32(t[offset], tmp), bias_ptr); + m[l + 3] = MS_ADDQ_F32(MS_SUBQ_F32(t[1 + offset], t[2 + offset]), bias_ptr); + m[l + 6] = MS_ADDQ_F32(MS_ADDQ_F32(tmp, t[3 + offset]), bias_ptr); + m[l] = MS_MAXQ_F32(zero, m[l]); + m[l + 3] = MS_MAXQ_F32(zero, m[l + 3]); + m[l + 6] = MS_MAXQ_F32(zero, m[l + 6]); } if (r_c == C4NUM && r_h == 3 && r_w == 3) { Store9Data; @@ -784,33 +789,33 @@ void OutputTransform4x3ReluUnit(const float *src_data, float *dst_data, const fl void OutputTransform4x3Relu6Unit(const float *src_data, float *dst_data, const float *bias_data, int src_step, int dst_step, int out_c, int r_w, int r_h, int r_c) { -#ifdef ENABLE_ARM - float32x4_t src[16]; - float32x4_t t[12]; - float32x4_t m[9]; - float32x4_t zero = vdupq_n_f32(0); - float32x4_t six = vdupq_n_f32(6); +#if defined(ENABLE_ARM) || defined(ENABLE_SSE) + MS_FLOAT32X4 src[16]; + MS_FLOAT32X4 t[12]; + MS_FLOAT32X4 m[9]; + MS_FLOAT32X4 zero = MS_MOVQ_F32(0); + MS_FLOAT32X4 six = MS_MOVQ_F32(6); Load16Data; - float32x4_t bias_ptr = vld1q_f32(bias_data); + MS_FLOAT32X4 bias_ptr = MS_LDQ_F32(bias_data); for (int l = 0; l < 4; ++l) { int offset = l * 4; - float32x4_t tmp = vaddq_f32(src[1 + offset], src[2 + offset]); - t[l] = vaddq_f32(src[offset], tmp); - t[l + 4] = vsubq_f32(src[1 + offset], src[2 + offset]); - t[l + 8] = vaddq_f32(tmp, src[3 + offset]); + MS_FLOAT32X4 tmp = MS_ADDQ_F32(src[1 + offset], src[2 + offset]); + t[l] = MS_ADDQ_F32(src[offset], tmp); + t[l + 4] = MS_SUBQ_F32(src[1 + offset], src[2 + offset]); + t[l + 8] = MS_ADDQ_F32(tmp, src[3 + offset]); } for (int l = 0; l < 3; ++l) { int offset = l * 4; - float32x4_t tmp = vaddq_f32(t[1 + offset], t[2 + offset]); - m[l] = vaddq_f32(vaddq_f32(t[offset], tmp), bias_ptr); - m[l + 3] = vaddq_f32(vsubq_f32(t[1 + offset], t[2 + offset]), bias_ptr); - m[l + 6] = vaddq_f32(vaddq_f32(tmp, t[3 + offset]), bias_ptr); - m[l] = vmaxq_f32(zero, m[l]); - m[l] = vminq_f32(six, m[l]); - m[l + 3] = vmaxq_f32(zero, m[l + 3]); - m[l + 3] = vminq_f32(six, m[l + 3]); - m[l + 6] = vmaxq_f32(zero, m[l + 6]); - m[l + 6] = vminq_f32(six, m[l + 6]); + MS_FLOAT32X4 tmp = MS_ADDQ_F32(t[1 + offset], t[2 + offset]); + m[l] = MS_ADDQ_F32(MS_ADDQ_F32(t[offset], tmp), bias_ptr); + m[l + 3] = MS_ADDQ_F32(MS_SUBQ_F32(t[1 + offset], t[2 + offset]), bias_ptr); + m[l + 6] = MS_ADDQ_F32(MS_ADDQ_F32(tmp, t[3 + offset]), bias_ptr); + m[l] = MS_MAXQ_F32(zero, m[l]); + m[l] = MS_MINQ_F32(six, m[l]); + m[l + 3] = MS_MAXQ_F32(zero, m[l + 3]); + m[l + 3] = MS_MINQ_F32(six, m[l + 3]); + m[l + 6] = MS_MAXQ_F32(zero, m[l + 6]); + m[l + 6] = MS_MINQ_F32(six, m[l + 6]); } if (r_c == C4NUM && r_h == 3 && r_w == 3) { Store9Data; @@ -863,29 +868,31 @@ void OutputTransform4x3Relu6Unit(const float *src_data, float *dst_data, const f void OutputTransform6x2Unit(const float *src_data, float *dst_data, const float *bias_data, int src_step, int dst_step, int out_c, int r_w, int r_h, int r_c) { -#ifdef ENABLE_ARM - float32x4_t src[36]; - float32x4_t t[12]; - float32x4_t m[4]; +#if defined(ENABLE_ARM) || defined(ENABLE_SSE) + MS_FLOAT32X4 src[36]; + MS_FLOAT32X4 t[12]; + MS_FLOAT32X4 m[4]; Load36Data; - float32x4_t bias_ptr = vld1q_f32(bias_data); + MS_FLOAT32X4 bias_ptr = MS_LDQ_F32(bias_data); for 
(int l = 0; l < 6; ++l) { int offset = l * 6; - t[l] = vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(src[offset], src[1 + offset]), src[2 + offset]), src[3 + offset]), - src[4 + offset]); - t[l + 6] = vaddq_f32(vaddq_f32(vsubq_f32(src[1 + offset], src[2 + offset]), - vmulq_n_f32(vsubq_f32(src[3 + offset], src[4 + offset]), 2)), - src[5 + offset]); + t[l] = + MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(src[offset], src[1 + offset]), src[2 + offset]), src[3 + offset]), + src[4 + offset]); + t[l + 6] = MS_ADDQ_F32(MS_ADDQ_F32(MS_SUBQ_F32(src[1 + offset], src[2 + offset]), + MS_MULQ_F32(MS_SUBQ_F32(src[3 + offset], src[4 + offset]), 2)), + src[5 + offset]); } for (int l = 0; l < 2; ++l) { int offset = l * 6; - m[l] = vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(t[offset], t[1 + offset]), t[2 + offset]), t[3 + offset]), t[4 + offset]), + m[l] = MS_ADDQ_F32( + MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(t[offset], t[1 + offset]), t[2 + offset]), t[3 + offset]), + t[4 + offset]), bias_ptr); - m[l + 2] = vaddq_f32(vaddq_f32(vaddq_f32(vsubq_f32(t[1 + offset], t[2 + offset]), - vmulq_n_f32(vsubq_f32(t[3 + offset], t[4 + offset]), 2)), - t[5 + offset]), - bias_ptr); + m[l + 2] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_SUBQ_F32(t[1 + offset], t[2 + offset]), + MS_MULQ_F32(MS_SUBQ_F32(t[3 + offset], t[4 + offset]), 2)), + t[5 + offset]), + bias_ptr); } if (r_c == C4NUM && r_h == 2 && r_w == 2) { Store4Data; @@ -933,32 +940,34 @@ void OutputTransform6x2Unit(const float *src_data, float *dst_data, const float void OutputTransform6x2ReluUnit(const float *src_data, float *dst_data, const float *bias_data, int src_step, int dst_step, int out_c, int r_w, int r_h, int r_c) { -#ifdef ENABLE_ARM - float32x4_t src[36]; - float32x4_t t[12]; - float32x4_t m[4]; - float32x4_t zero = vdupq_n_f32(0); +#if defined(ENABLE_ARM) || defined(ENABLE_SSE) + MS_FLOAT32X4 src[36]; + MS_FLOAT32X4 t[12]; + MS_FLOAT32X4 m[4]; + MS_FLOAT32X4 zero = MS_MOVQ_F32(0); Load36Data; - float32x4_t bias_ptr = vld1q_f32(bias_data); + MS_FLOAT32X4 bias_ptr = MS_LDQ_F32(bias_data); for (int l = 0; l < 6; ++l) { int offset = l * 6; - t[l] = vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(src[offset], src[1 + offset]), src[2 + offset]), src[3 + offset]), - src[4 + offset]); - t[l + 6] = vaddq_f32(vaddq_f32(vsubq_f32(src[1 + offset], src[2 + offset]), - vmulq_n_f32(vsubq_f32(src[3 + offset], src[4 + offset]), 2)), - src[5 + offset]); + t[l] = + MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(src[offset], src[1 + offset]), src[2 + offset]), src[3 + offset]), + src[4 + offset]); + t[l + 6] = MS_ADDQ_F32(MS_ADDQ_F32(MS_SUBQ_F32(src[1 + offset], src[2 + offset]), + MS_MULQ_F32(MS_SUBQ_F32(src[3 + offset], src[4 + offset]), 2)), + src[5 + offset]); } for (int l = 0; l < 2; ++l) { int offset = l * 6; - m[l] = vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(t[offset], t[1 + offset]), t[2 + offset]), t[3 + offset]), t[4 + offset]), + m[l] = MS_ADDQ_F32( + MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(t[offset], t[1 + offset]), t[2 + offset]), t[3 + offset]), + t[4 + offset]), bias_ptr); - m[l + 2] = vaddq_f32(vaddq_f32(vaddq_f32(vsubq_f32(t[1 + offset], t[2 + offset]), - vmulq_n_f32(vsubq_f32(t[3 + offset], t[4 + offset]), 2)), - t[5 + offset]), - bias_ptr); - m[l] = vmaxq_f32(zero, m[l]); - m[l + 2] = vmaxq_f32(zero, m[l + 2]); + m[l + 2] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_SUBQ_F32(t[1 + offset], t[2 + offset]), + MS_MULQ_F32(MS_SUBQ_F32(t[3 + offset], t[4 + offset]), 2)), + t[5 + offset]), + bias_ptr); + m[l] = MS_MAXQ_F32(zero, m[l]); + 
m[l + 2] = MS_MAXQ_F32(zero, m[l + 2]); } if (r_c == C4NUM && r_h == 2 && r_w == 2) { Store4Data; @@ -1008,35 +1017,37 @@ void OutputTransform6x2ReluUnit(const float *src_data, float *dst_data, const fl void OutputTransform6x2Relu6Unit(const float *src_data, float *dst_data, const float *bias_data, int src_step, int dst_step, int out_c, int r_w, int r_h, int r_c) { -#ifdef ENABLE_ARM - float32x4_t src[36]; - float32x4_t t[12]; - float32x4_t m[4]; - float32x4_t zero = vdupq_n_f32(0); - float32x4_t six = vdupq_n_f32(6); +#if defined(ENABLE_ARM) || defined(ENABLE_SSE) + MS_FLOAT32X4 src[36]; + MS_FLOAT32X4 t[12]; + MS_FLOAT32X4 m[4]; + MS_FLOAT32X4 zero = MS_MOVQ_F32(0); + MS_FLOAT32X4 six = MS_MOVQ_F32(6); Load36Data; - float32x4_t bias_ptr = vld1q_f32(bias_data); + MS_FLOAT32X4 bias_ptr = MS_LDQ_F32(bias_data); for (int l = 0; l < 6; ++l) { int offset = l * 6; - t[l] = vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(src[offset], src[1 + offset]), src[2 + offset]), src[3 + offset]), - src[4 + offset]); - t[l + 6] = vaddq_f32(vaddq_f32(vsubq_f32(src[1 + offset], src[2 + offset]), - vmulq_n_f32(vsubq_f32(src[3 + offset], src[4 + offset]), 2)), - src[5 + offset]); + t[l] = + MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(src[offset], src[1 + offset]), src[2 + offset]), src[3 + offset]), + src[4 + offset]); + t[l + 6] = MS_ADDQ_F32(MS_ADDQ_F32(MS_SUBQ_F32(src[1 + offset], src[2 + offset]), + MS_MULQ_F32(MS_SUBQ_F32(src[3 + offset], src[4 + offset]), 2)), + src[5 + offset]); } for (int l = 0; l < 2; ++l) { int offset = l * 6; - m[l] = vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(t[offset], t[1 + offset]), t[2 + offset]), t[3 + offset]), t[4 + offset]), + m[l] = MS_ADDQ_F32( + MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(t[offset], t[1 + offset]), t[2 + offset]), t[3 + offset]), + t[4 + offset]), bias_ptr); - m[l + 2] = vaddq_f32(vaddq_f32(vaddq_f32(vsubq_f32(t[1 + offset], t[2 + offset]), - vmulq_n_f32(vsubq_f32(t[3 + offset], t[4 + offset]), 2)), - t[5 + offset]), - bias_ptr); - m[l] = vmaxq_f32(zero, m[l]); - m[l] = vminq_f32(six, m[l]); - m[l + 2] = vmaxq_f32(zero, m[l + 2]); - m[l + 2] = vminq_f32(six, m[l + 2]); + m[l + 2] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_SUBQ_F32(t[1 + offset], t[2 + offset]), + MS_MULQ_F32(MS_SUBQ_F32(t[3 + offset], t[4 + offset]), 2)), + t[5 + offset]), + bias_ptr); + m[l] = MS_MAXQ_F32(zero, m[l]); + m[l] = MS_MINQ_F32(six, m[l]); + m[l + 2] = MS_MAXQ_F32(zero, m[l + 2]); + m[l + 2] = MS_MINQ_F32(six, m[l + 2]); } if (r_c == C4NUM && r_h == 2 && r_w == 2) { Store4Data; @@ -1087,30 +1098,30 @@ void OutputTransform6x2Relu6Unit(const float *src_data, float *dst_data, const f void OutputTransform6x3Unit(const float *src_data, float *dst_data, const float *bias_data, int src_step, int dst_step, int out_c, int r_w, int r_h, int r_c) { -#ifdef ENABLE_ARM - float32x4_t src[36]; - float32x4_t t[18]; - float32x4_t m[9]; +#if defined(ENABLE_ARM) || defined(ENABLE_SSE) + MS_FLOAT32X4 src[36]; + MS_FLOAT32X4 t[18]; + MS_FLOAT32X4 m[9]; Load36Data; - float32x4_t bias_ptr = vld1q_f32(bias_data); + MS_FLOAT32X4 bias_ptr = MS_LDQ_F32(bias_data); for (int l = 0; l < 6; ++l) { int offset = l * 6; - float32x4_t tmp1 = vaddq_f32(src[1 + offset], src[2 + offset]); - float32x4_t tmp2 = vaddq_f32(src[3 + offset], src[4 + offset]); - t[l] = vaddq_f32(vaddq_f32(src[offset], tmp1), tmp2); - t[l + 6] = vaddq_f32(vsubq_f32(src[1 + offset], src[2 + offset]), - vmulq_n_f32(vsubq_f32(src[3 + offset], src[4 + offset]), 2)); - t[l + 12] = vaddq_f32(vaddq_f32(tmp1, vmulq_n_f32(tmp2, 4)), src[5 
+ offset]); + MS_FLOAT32X4 tmp1 = MS_ADDQ_F32(src[1 + offset], src[2 + offset]); + MS_FLOAT32X4 tmp2 = MS_ADDQ_F32(src[3 + offset], src[4 + offset]); + t[l] = MS_ADDQ_F32(MS_ADDQ_F32(src[offset], tmp1), tmp2); + t[l + 6] = MS_ADDQ_F32(MS_SUBQ_F32(src[1 + offset], src[2 + offset]), + MS_MULQ_F32(MS_SUBQ_F32(src[3 + offset], src[4 + offset]), 2)); + t[l + 12] = MS_ADDQ_F32(MS_ADDQ_F32(tmp1, MS_MULQ_F32(tmp2, 4)), src[5 + offset]); } for (int l = 0; l < 3; ++l) { int offset = l * 6; - float32x4_t tmp1 = vaddq_f32(t[1 + offset], t[2 + offset]); - float32x4_t tmp2 = vaddq_f32(t[3 + offset], t[4 + offset]); - m[l] = vaddq_f32(vaddq_f32(vaddq_f32(t[offset], tmp1), tmp2), bias_ptr); - m[l + 3] = vaddq_f32( - vaddq_f32(vsubq_f32(t[1 + offset], t[2 + offset]), vmulq_n_f32(vsubq_f32(t[3 + offset], t[4 + offset]), 2)), + MS_FLOAT32X4 tmp1 = MS_ADDQ_F32(t[1 + offset], t[2 + offset]); + MS_FLOAT32X4 tmp2 = MS_ADDQ_F32(t[3 + offset], t[4 + offset]); + m[l] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(t[offset], tmp1), tmp2), bias_ptr); + m[l + 3] = MS_ADDQ_F32( + MS_ADDQ_F32(MS_SUBQ_F32(t[1 + offset], t[2 + offset]), MS_MULQ_F32(MS_SUBQ_F32(t[3 + offset], t[4 + offset]), 2)), bias_ptr); - m[l + 6] = vaddq_f32(vaddq_f32(vaddq_f32(tmp1, vmulq_n_f32(tmp2, 4)), t[5 + offset]), bias_ptr); + m[l + 6] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(tmp1, MS_MULQ_F32(tmp2, 4)), t[5 + offset]), bias_ptr); } if (r_c == C4NUM && r_h == 3 && r_w == 3) { Store9Data; @@ -1160,34 +1171,34 @@ void OutputTransform6x3Unit(const float *src_data, float *dst_data, const float void OutputTransform6x3ReluUnit(const float *src_data, float *dst_data, const float *bias_data, int src_step, int dst_step, int out_c, int r_w, int r_h, int r_c) { -#ifdef ENABLE_ARM - float32x4_t src[36]; - float32x4_t t[18]; - float32x4_t m[9]; - float32x4_t zero = vdupq_n_f32(0); +#if defined(ENABLE_ARM) || defined(ENABLE_SSE) + MS_FLOAT32X4 src[36]; + MS_FLOAT32X4 t[18]; + MS_FLOAT32X4 m[9]; + MS_FLOAT32X4 zero = MS_MOVQ_F32(0); Load36Data; - float32x4_t bias_ptr = vld1q_f32(bias_data); + MS_FLOAT32X4 bias_ptr = MS_LDQ_F32(bias_data); for (int l = 0; l < 6; ++l) { int offset = l * 6; - float32x4_t tmp1 = vaddq_f32(src[1 + offset], src[2 + offset]); - float32x4_t tmp2 = vaddq_f32(src[3 + offset], src[4 + offset]); - t[l] = vaddq_f32(vaddq_f32(src[offset], tmp1), tmp2); - t[l + 6] = vaddq_f32(vsubq_f32(src[1 + offset], src[2 + offset]), - vmulq_n_f32(vsubq_f32(src[3 + offset], src[4 + offset]), 2)); - t[l + 12] = vaddq_f32(vaddq_f32(tmp1, vmulq_n_f32(tmp2, 4)), src[5 + offset]); + MS_FLOAT32X4 tmp1 = MS_ADDQ_F32(src[1 + offset], src[2 + offset]); + MS_FLOAT32X4 tmp2 = MS_ADDQ_F32(src[3 + offset], src[4 + offset]); + t[l] = MS_ADDQ_F32(MS_ADDQ_F32(src[offset], tmp1), tmp2); + t[l + 6] = MS_ADDQ_F32(MS_SUBQ_F32(src[1 + offset], src[2 + offset]), + MS_MULQ_F32(MS_SUBQ_F32(src[3 + offset], src[4 + offset]), 2)); + t[l + 12] = MS_ADDQ_F32(MS_ADDQ_F32(tmp1, MS_MULQ_F32(tmp2, 4)), src[5 + offset]); } for (int l = 0; l < 3; ++l) { int offset = l * 6; - float32x4_t tmp1 = vaddq_f32(t[1 + offset], t[2 + offset]); - float32x4_t tmp2 = vaddq_f32(t[3 + offset], t[4 + offset]); - m[l] = vaddq_f32(vaddq_f32(vaddq_f32(t[offset], tmp1), tmp2), bias_ptr); - m[l + 3] = vaddq_f32( - vaddq_f32(vsubq_f32(t[1 + offset], t[2 + offset]), vmulq_n_f32(vsubq_f32(t[3 + offset], t[4 + offset]), 2)), + MS_FLOAT32X4 tmp1 = MS_ADDQ_F32(t[1 + offset], t[2 + offset]); + MS_FLOAT32X4 tmp2 = MS_ADDQ_F32(t[3 + offset], t[4 + offset]); + m[l] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(t[offset], tmp1), tmp2), 
bias_ptr); + m[l + 3] = MS_ADDQ_F32( + MS_ADDQ_F32(MS_SUBQ_F32(t[1 + offset], t[2 + offset]), MS_MULQ_F32(MS_SUBQ_F32(t[3 + offset], t[4 + offset]), 2)), bias_ptr); - m[l + 6] = vaddq_f32(vaddq_f32(vaddq_f32(tmp1, vmulq_n_f32(tmp2, 4)), t[5 + offset]), bias_ptr); - m[l] = vmaxq_f32(zero, m[l]); - m[l + 3] = vmaxq_f32(zero, m[l + 3]); - m[l + 6] = vmaxq_f32(zero, m[l + 6]); + m[l + 6] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(tmp1, MS_MULQ_F32(tmp2, 4)), t[5 + offset]), bias_ptr); + m[l] = MS_MAXQ_F32(zero, m[l]); + m[l + 3] = MS_MAXQ_F32(zero, m[l + 3]); + m[l + 6] = MS_MAXQ_F32(zero, m[l + 6]); } if (r_c == C4NUM && r_h == 3 && r_w == 3) { Store9Data; @@ -1239,38 +1250,38 @@ void OutputTransform6x3ReluUnit(const float *src_data, float *dst_data, const fl void OutputTransform6x3Relu6Unit(const float *src_data, float *dst_data, const float *bias_data, int src_step, int dst_step, int out_c, int r_w, int r_h, int r_c) { -#ifdef ENABLE_ARM - float32x4_t src[36]; - float32x4_t t[18]; - float32x4_t m[9]; - float32x4_t zero = vdupq_n_f32(0); - float32x4_t six = vdupq_n_f32(6); +#if defined(ENABLE_ARM) || defined(ENABLE_SSE) + MS_FLOAT32X4 src[36]; + MS_FLOAT32X4 t[18]; + MS_FLOAT32X4 m[9]; + MS_FLOAT32X4 zero = MS_MOVQ_F32(0); + MS_FLOAT32X4 six = MS_MOVQ_F32(6); Load36Data; - float32x4_t bias_ptr = vld1q_f32(bias_data); + MS_FLOAT32X4 bias_ptr = MS_LDQ_F32(bias_data); for (int l = 0; l < 6; ++l) { int offset = l * 6; - float32x4_t tmp1 = vaddq_f32(src[1 + offset], src[2 + offset]); - float32x4_t tmp2 = vaddq_f32(src[3 + offset], src[4 + offset]); - t[l] = vaddq_f32(vaddq_f32(src[offset], tmp1), tmp2); - t[l + 6] = vaddq_f32(vsubq_f32(src[1 + offset], src[2 + offset]), - vmulq_n_f32(vsubq_f32(src[3 + offset], src[4 + offset]), 2)); - t[l + 12] = vaddq_f32(vaddq_f32(tmp1, vmulq_n_f32(tmp2, 4)), src[5 + offset]); + MS_FLOAT32X4 tmp1 = MS_ADDQ_F32(src[1 + offset], src[2 + offset]); + MS_FLOAT32X4 tmp2 = MS_ADDQ_F32(src[3 + offset], src[4 + offset]); + t[l] = MS_ADDQ_F32(MS_ADDQ_F32(src[offset], tmp1), tmp2); + t[l + 6] = MS_ADDQ_F32(MS_SUBQ_F32(src[1 + offset], src[2 + offset]), + MS_MULQ_F32(MS_SUBQ_F32(src[3 + offset], src[4 + offset]), 2)); + t[l + 12] = MS_ADDQ_F32(MS_ADDQ_F32(tmp1, MS_MULQ_F32(tmp2, 4)), src[5 + offset]); } for (int l = 0; l < 3; ++l) { int offset = l * 6; - float32x4_t tmp1 = vaddq_f32(t[1 + offset], t[2 + offset]); - float32x4_t tmp2 = vaddq_f32(t[3 + offset], t[4 + offset]); - m[l] = vaddq_f32(vaddq_f32(vaddq_f32(t[offset], tmp1), tmp2), bias_ptr); - m[l + 3] = vaddq_f32( - vaddq_f32(vsubq_f32(t[1 + offset], t[2 + offset]), vmulq_n_f32(vsubq_f32(t[3 + offset], t[4 + offset]), 2)), + MS_FLOAT32X4 tmp1 = MS_ADDQ_F32(t[1 + offset], t[2 + offset]); + MS_FLOAT32X4 tmp2 = MS_ADDQ_F32(t[3 + offset], t[4 + offset]); + m[l] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(t[offset], tmp1), tmp2), bias_ptr); + m[l + 3] = MS_ADDQ_F32( + MS_ADDQ_F32(MS_SUBQ_F32(t[1 + offset], t[2 + offset]), MS_MULQ_F32(MS_SUBQ_F32(t[3 + offset], t[4 + offset]), 2)), bias_ptr); - m[l + 6] = vaddq_f32(vaddq_f32(vaddq_f32(tmp1, vmulq_n_f32(tmp2, 4)), t[5 + offset]), bias_ptr); - m[l] = vmaxq_f32(zero, m[l]); - m[l] = vminq_f32(six, m[l]); - m[l + 3] = vmaxq_f32(zero, m[l + 3]); - m[l + 3] = vminq_f32(six, m[l + 3]); - m[l + 6] = vmaxq_f32(zero, m[l + 6]); - m[l + 6] = vminq_f32(six, m[l + 6]); + m[l + 6] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(tmp1, MS_MULQ_F32(tmp2, 4)), t[5 + offset]), bias_ptr); + m[l] = MS_MAXQ_F32(zero, m[l]); + m[l] = MS_MINQ_F32(six, m[l]); + m[l + 3] = MS_MAXQ_F32(zero, m[l + 3]); + m[l + 3] = 
MS_MINQ_F32(six, m[l + 3]); + m[l + 6] = MS_MAXQ_F32(zero, m[l + 6]); + m[l + 6] = MS_MINQ_F32(six, m[l + 6]); } if (r_c == C4NUM && r_h == 3 && r_w == 3) { Store9Data; @@ -1323,33 +1334,33 @@ void OutputTransform6x3Relu6Unit(const float *src_data, float *dst_data, const f void OutputTransform6x4Unit(const float *src_data, float *dst_data, const float *bias_data, int src_step, int dst_step, int out_c, int r_w, int r_h, int r_c) { -#ifdef ENABLE_ARM - float32x4_t src[36]; - float32x4_t t[24]; - float32x4_t m[16]; +#if defined(ENABLE_ARM) || defined(ENABLE_SSE) + MS_FLOAT32X4 src[36]; + MS_FLOAT32X4 t[24]; + MS_FLOAT32X4 m[16]; Load36Data; - float32x4_t bias_ptr = vld1q_f32(bias_data); + MS_FLOAT32X4 bias_ptr = MS_LDQ_F32(bias_data); for (int l = 0; l < 6; ++l) { int offset = l * 6; - float32x4_t tmp1 = vaddq_f32(src[1 + offset], src[2 + offset]); - float32x4_t tmp2 = vaddq_f32(src[3 + offset], src[4 + offset]); - float32x4_t tmp3 = vsubq_f32(src[1 + offset], src[2 + offset]); - float32x4_t tmp4 = vsubq_f32(src[3 + offset], src[4 + offset]); - t[l] = vaddq_f32(vaddq_f32(src[offset], tmp1), tmp2); - t[l + 6] = vaddq_f32(tmp3, vmulq_n_f32(tmp4, 2)); - t[l + 12] = vaddq_f32(tmp1, vmulq_n_f32(tmp2, 4)); - t[l + 18] = vaddq_f32(vaddq_f32(tmp3, vmulq_n_f32(tmp4, 8)), src[5 + offset]); + MS_FLOAT32X4 tmp1 = MS_ADDQ_F32(src[1 + offset], src[2 + offset]); + MS_FLOAT32X4 tmp2 = MS_ADDQ_F32(src[3 + offset], src[4 + offset]); + MS_FLOAT32X4 tmp3 = MS_SUBQ_F32(src[1 + offset], src[2 + offset]); + MS_FLOAT32X4 tmp4 = MS_SUBQ_F32(src[3 + offset], src[4 + offset]); + t[l] = MS_ADDQ_F32(MS_ADDQ_F32(src[offset], tmp1), tmp2); + t[l + 6] = MS_ADDQ_F32(tmp3, MS_MULQ_F32(tmp4, 2)); + t[l + 12] = MS_ADDQ_F32(tmp1, MS_MULQ_F32(tmp2, 4)); + t[l + 18] = MS_ADDQ_F32(MS_ADDQ_F32(tmp3, MS_MULQ_F32(tmp4, 8)), src[5 + offset]); } for (int l = 0; l < 4; ++l) { int offset = l * 6; - float32x4_t tmp1 = vaddq_f32(t[1 + offset], t[2 + offset]); - float32x4_t tmp2 = vaddq_f32(t[3 + offset], t[4 + offset]); - float32x4_t tmp3 = vsubq_f32(t[1 + offset], t[2 + offset]); - float32x4_t tmp4 = vsubq_f32(t[3 + offset], t[4 + offset]); - m[l] = vaddq_f32(vaddq_f32(vaddq_f32(t[offset], tmp1), tmp2), bias_ptr); - m[l + 4] = vaddq_f32(vaddq_f32(tmp3, vmulq_n_f32(tmp4, 2)), bias_ptr); - m[l + 8] = vaddq_f32(vaddq_f32(tmp1, vmulq_n_f32(tmp2, 4)), bias_ptr); - m[l + 12] = vaddq_f32(vaddq_f32(vaddq_f32(tmp3, vmulq_n_f32(tmp4, 8)), t[5 + offset]), bias_ptr); + MS_FLOAT32X4 tmp1 = MS_ADDQ_F32(t[1 + offset], t[2 + offset]); + MS_FLOAT32X4 tmp2 = MS_ADDQ_F32(t[3 + offset], t[4 + offset]); + MS_FLOAT32X4 tmp3 = MS_SUBQ_F32(t[1 + offset], t[2 + offset]); + MS_FLOAT32X4 tmp4 = MS_SUBQ_F32(t[3 + offset], t[4 + offset]); + m[l] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(t[offset], tmp1), tmp2), bias_ptr); + m[l + 4] = MS_ADDQ_F32(MS_ADDQ_F32(tmp3, MS_MULQ_F32(tmp4, 2)), bias_ptr); + m[l + 8] = MS_ADDQ_F32(MS_ADDQ_F32(tmp1, MS_MULQ_F32(tmp2, 4)), bias_ptr); + m[l + 12] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(tmp3, MS_MULQ_F32(tmp4, 8)), t[5 + offset]), bias_ptr); } if (r_c == C4NUM && r_h == 4 && r_w == 4) { Store16Data; @@ -1401,38 +1412,38 @@ void OutputTransform6x4Unit(const float *src_data, float *dst_data, const float void OutputTransform6x4ReluUnit(const float *src_data, float *dst_data, const float *bias_data, int src_step, int dst_step, int out_c, int r_w, int r_h, int r_c) { -#ifdef ENABLE_ARM - float32x4_t src[36]; - float32x4_t t[24]; - float32x4_t m[16]; - float32x4_t zero = vdupq_n_f32(0); +#if defined(ENABLE_ARM) || defined(ENABLE_SSE) + 
MS_FLOAT32X4 src[36]; + MS_FLOAT32X4 t[24]; + MS_FLOAT32X4 m[16]; + MS_FLOAT32X4 zero = MS_MOVQ_F32(0); Load36Data; - float32x4_t bias_ptr = vld1q_f32(bias_data); + MS_FLOAT32X4 bias_ptr = MS_LDQ_F32(bias_data); for (int l = 0; l < 6; ++l) { int offset = l * 6; - float32x4_t tmp1 = vaddq_f32(src[1 + offset], src[2 + offset]); - float32x4_t tmp2 = vaddq_f32(src[3 + offset], src[4 + offset]); - float32x4_t tmp3 = vsubq_f32(src[1 + offset], src[2 + offset]); - float32x4_t tmp4 = vsubq_f32(src[3 + offset], src[4 + offset]); - t[l] = vaddq_f32(vaddq_f32(src[offset], tmp1), tmp2); - t[l + 6] = vaddq_f32(tmp3, vmulq_n_f32(tmp4, 2)); - t[l + 12] = vaddq_f32(tmp1, vmulq_n_f32(tmp2, 4)); - t[l + 18] = vaddq_f32(vaddq_f32(tmp3, vmulq_n_f32(tmp4, 8)), src[5 + offset]); + MS_FLOAT32X4 tmp1 = MS_ADDQ_F32(src[1 + offset], src[2 + offset]); + MS_FLOAT32X4 tmp2 = MS_ADDQ_F32(src[3 + offset], src[4 + offset]); + MS_FLOAT32X4 tmp3 = MS_SUBQ_F32(src[1 + offset], src[2 + offset]); + MS_FLOAT32X4 tmp4 = MS_SUBQ_F32(src[3 + offset], src[4 + offset]); + t[l] = MS_ADDQ_F32(MS_ADDQ_F32(src[offset], tmp1), tmp2); + t[l + 6] = MS_ADDQ_F32(tmp3, MS_MULQ_F32(tmp4, 2)); + t[l + 12] = MS_ADDQ_F32(tmp1, MS_MULQ_F32(tmp2, 4)); + t[l + 18] = MS_ADDQ_F32(MS_ADDQ_F32(tmp3, MS_MULQ_F32(tmp4, 8)), src[5 + offset]); } for (int l = 0; l < 4; ++l) { int offset = l * 6; - float32x4_t tmp1 = vaddq_f32(t[1 + offset], t[2 + offset]); - float32x4_t tmp2 = vaddq_f32(t[3 + offset], t[4 + offset]); - float32x4_t tmp3 = vsubq_f32(t[1 + offset], t[2 + offset]); - float32x4_t tmp4 = vsubq_f32(t[3 + offset], t[4 + offset]); - m[l] = vaddq_f32(vaddq_f32(vaddq_f32(t[offset], tmp1), tmp2), bias_ptr); - m[l + 4] = vaddq_f32(vaddq_f32(tmp3, vmulq_n_f32(tmp4, 2)), bias_ptr); - m[l + 8] = vaddq_f32(vaddq_f32(tmp1, vmulq_n_f32(tmp2, 4)), bias_ptr); - m[l + 12] = vaddq_f32(vaddq_f32(vaddq_f32(tmp3, vmulq_n_f32(tmp4, 8)), t[5 + offset]), bias_ptr); - m[l] = vmaxq_f32(zero, m[l]); - m[l + 4] = vmaxq_f32(zero, m[l + 4]); - m[l + 8] = vmaxq_f32(zero, m[l + 8]); - m[l + 12] = vmaxq_f32(zero, m[l + 12]); + MS_FLOAT32X4 tmp1 = MS_ADDQ_F32(t[1 + offset], t[2 + offset]); + MS_FLOAT32X4 tmp2 = MS_ADDQ_F32(t[3 + offset], t[4 + offset]); + MS_FLOAT32X4 tmp3 = MS_SUBQ_F32(t[1 + offset], t[2 + offset]); + MS_FLOAT32X4 tmp4 = MS_SUBQ_F32(t[3 + offset], t[4 + offset]); + m[l] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(t[offset], tmp1), tmp2), bias_ptr); + m[l + 4] = MS_ADDQ_F32(MS_ADDQ_F32(tmp3, MS_MULQ_F32(tmp4, 2)), bias_ptr); + m[l + 8] = MS_ADDQ_F32(MS_ADDQ_F32(tmp1, MS_MULQ_F32(tmp2, 4)), bias_ptr); + m[l + 12] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(tmp3, MS_MULQ_F32(tmp4, 8)), t[5 + offset]), bias_ptr); + m[l] = MS_MAXQ_F32(zero, m[l]); + m[l + 4] = MS_MAXQ_F32(zero, m[l + 4]); + m[l + 8] = MS_MAXQ_F32(zero, m[l + 8]); + m[l + 12] = MS_MAXQ_F32(zero, m[l + 12]); } if (r_c == C4NUM && r_h == 4 && r_w == 4) { Store16Data; @@ -1486,43 +1497,43 @@ void OutputTransform6x4ReluUnit(const float *src_data, float *dst_data, const fl void OutputTransform6x4Relu6Unit(const float *src_data, float *dst_data, const float *bias_data, int src_step, int dst_step, int out_c, int r_w, int r_h, int r_c) { -#ifdef ENABLE_ARM - float32x4_t src[36]; - float32x4_t t[24]; - float32x4_t m[16]; - float32x4_t zero = vdupq_n_f32(0); - float32x4_t six = vdupq_n_f32(6); +#if defined(ENABLE_ARM) || defined(ENABLE_SSE) + MS_FLOAT32X4 src[36]; + MS_FLOAT32X4 t[24]; + MS_FLOAT32X4 m[16]; + MS_FLOAT32X4 zero = MS_MOVQ_F32(0); + MS_FLOAT32X4 six = MS_MOVQ_F32(6); Load36Data; - float32x4_t bias_ptr = 
vld1q_f32(bias_data); + MS_FLOAT32X4 bias_ptr = MS_LDQ_F32(bias_data); for (int l = 0; l < 6; ++l) { int offset = l * 6; - float32x4_t tmp1 = vaddq_f32(src[1 + offset], src[2 + offset]); - float32x4_t tmp2 = vaddq_f32(src[3 + offset], src[4 + offset]); - float32x4_t tmp3 = vsubq_f32(src[1 + offset], src[2 + offset]); - float32x4_t tmp4 = vsubq_f32(src[3 + offset], src[4 + offset]); - t[l] = vaddq_f32(vaddq_f32(src[offset], tmp1), tmp2); - t[l + 6] = vaddq_f32(tmp3, vmulq_n_f32(tmp4, 2)); - t[l + 12] = vaddq_f32(tmp1, vmulq_n_f32(tmp2, 4)); - t[l + 18] = vaddq_f32(vaddq_f32(tmp3, vmulq_n_f32(tmp4, 8)), src[5 + offset]); + MS_FLOAT32X4 tmp1 = MS_ADDQ_F32(src[1 + offset], src[2 + offset]); + MS_FLOAT32X4 tmp2 = MS_ADDQ_F32(src[3 + offset], src[4 + offset]); + MS_FLOAT32X4 tmp3 = MS_SUBQ_F32(src[1 + offset], src[2 + offset]); + MS_FLOAT32X4 tmp4 = MS_SUBQ_F32(src[3 + offset], src[4 + offset]); + t[l] = MS_ADDQ_F32(MS_ADDQ_F32(src[offset], tmp1), tmp2); + t[l + 6] = MS_ADDQ_F32(tmp3, MS_MULQ_F32(tmp4, 2)); + t[l + 12] = MS_ADDQ_F32(tmp1, MS_MULQ_F32(tmp2, 4)); + t[l + 18] = MS_ADDQ_F32(MS_ADDQ_F32(tmp3, MS_MULQ_F32(tmp4, 8)), src[5 + offset]); } for (int l = 0; l < 4; ++l) { int offset = l * 6; - float32x4_t tmp1 = vaddq_f32(t[1 + offset], t[2 + offset]); - float32x4_t tmp2 = vaddq_f32(t[3 + offset], t[4 + offset]); - float32x4_t tmp3 = vsubq_f32(t[1 + offset], t[2 + offset]); - float32x4_t tmp4 = vsubq_f32(t[3 + offset], t[4 + offset]); - m[l] = vaddq_f32(vaddq_f32(vaddq_f32(t[offset], tmp1), tmp2), bias_ptr); - m[l + 4] = vaddq_f32(vaddq_f32(tmp3, vmulq_n_f32(tmp4, 2)), bias_ptr); - m[l + 8] = vaddq_f32(vaddq_f32(tmp1, vmulq_n_f32(tmp2, 4)), bias_ptr); - m[l + 12] = vaddq_f32(vaddq_f32(vaddq_f32(tmp3, vmulq_n_f32(tmp4, 8)), t[5 + offset]), bias_ptr); - m[l] = vmaxq_f32(zero, m[l]); - m[l] = vminq_f32(six, m[l]); - m[l + 4] = vmaxq_f32(zero, m[l + 4]); - m[l + 4] = vminq_f32(six, m[l + 4]); - m[l + 8] = vmaxq_f32(zero, m[l + 8]); - m[l + 8] = vminq_f32(six, m[l + 8]); - m[l + 12] = vmaxq_f32(zero, m[l + 12]); - m[l + 12] = vminq_f32(six, m[l + 12]); + MS_FLOAT32X4 tmp1 = MS_ADDQ_F32(t[1 + offset], t[2 + offset]); + MS_FLOAT32X4 tmp2 = MS_ADDQ_F32(t[3 + offset], t[4 + offset]); + MS_FLOAT32X4 tmp3 = MS_SUBQ_F32(t[1 + offset], t[2 + offset]); + MS_FLOAT32X4 tmp4 = MS_SUBQ_F32(t[3 + offset], t[4 + offset]); + m[l] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(t[offset], tmp1), tmp2), bias_ptr); + m[l + 4] = MS_ADDQ_F32(MS_ADDQ_F32(tmp3, MS_MULQ_F32(tmp4, 2)), bias_ptr); + m[l + 8] = MS_ADDQ_F32(MS_ADDQ_F32(tmp1, MS_MULQ_F32(tmp2, 4)), bias_ptr); + m[l + 12] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(tmp3, MS_MULQ_F32(tmp4, 8)), t[5 + offset]), bias_ptr); + m[l] = MS_MAXQ_F32(zero, m[l]); + m[l] = MS_MINQ_F32(six, m[l]); + m[l + 4] = MS_MAXQ_F32(zero, m[l + 4]); + m[l + 4] = MS_MINQ_F32(six, m[l + 4]); + m[l + 8] = MS_MAXQ_F32(zero, m[l + 8]); + m[l + 8] = MS_MINQ_F32(six, m[l + 8]); + m[l + 12] = MS_MAXQ_F32(zero, m[l + 12]); + m[l + 12] = MS_MINQ_F32(six, m[l + 12]); } if (r_c == C4NUM && r_h == 4 && r_w == 4) { Store16Data; @@ -1577,35 +1588,35 @@ void OutputTransform6x4Relu6Unit(const float *src_data, float *dst_data, const f void OutputTransform6x5Unit(const float *src_data, float *dst_data, const float *bias_data, int src_step, int dst_step, int out_c, int r_w, int r_h, int r_c) { -#ifdef ENABLE_ARM - float32x4_t src[36]; - float32x4_t t[30]; - float32x4_t m[25]; +#if defined(ENABLE_ARM) || defined(ENABLE_SSE) + MS_FLOAT32X4 src[36]; + MS_FLOAT32X4 t[30]; + MS_FLOAT32X4 m[25]; Load36Data; - float32x4_t 
bias_ptr = vld1q_f32(bias_data); + MS_FLOAT32X4 bias_ptr = MS_LDQ_F32(bias_data); for (int l = 0; l < 6; ++l) { int offset = l * 6; - float32x4_t tmp1 = vaddq_f32(src[1 + offset], src[2 + offset]); - float32x4_t tmp2 = vaddq_f32(src[3 + offset], src[4 + offset]); - float32x4_t tmp3 = vsubq_f32(src[1 + offset], src[2 + offset]); - float32x4_t tmp4 = vsubq_f32(src[3 + offset], src[4 + offset]); - t[l] = vaddq_f32(vaddq_f32(src[offset], tmp1), tmp2); - t[l + 6] = vaddq_f32(tmp3, vmulq_n_f32(tmp4, 2)); - t[l + 12] = vaddq_f32(tmp1, vmulq_n_f32(tmp2, 4)); - t[l + 18] = vaddq_f32(tmp3, vmulq_n_f32(tmp4, 8)); - t[l + 24] = vaddq_f32(vaddq_f32(tmp1, vmulq_n_f32(tmp2, 16)), src[5 + offset]); + MS_FLOAT32X4 tmp1 = MS_ADDQ_F32(src[1 + offset], src[2 + offset]); + MS_FLOAT32X4 tmp2 = MS_ADDQ_F32(src[3 + offset], src[4 + offset]); + MS_FLOAT32X4 tmp3 = MS_SUBQ_F32(src[1 + offset], src[2 + offset]); + MS_FLOAT32X4 tmp4 = MS_SUBQ_F32(src[3 + offset], src[4 + offset]); + t[l] = MS_ADDQ_F32(MS_ADDQ_F32(src[offset], tmp1), tmp2); + t[l + 6] = MS_ADDQ_F32(tmp3, MS_MULQ_F32(tmp4, 2)); + t[l + 12] = MS_ADDQ_F32(tmp1, MS_MULQ_F32(tmp2, 4)); + t[l + 18] = MS_ADDQ_F32(tmp3, MS_MULQ_F32(tmp4, 8)); + t[l + 24] = MS_ADDQ_F32(MS_ADDQ_F32(tmp1, MS_MULQ_F32(tmp2, 16)), src[5 + offset]); } for (int l = 0; l < 5; ++l) { int offset = l * 6; - float32x4_t tmp1 = vaddq_f32(t[1 + offset], t[2 + offset]); - float32x4_t tmp2 = vaddq_f32(t[3 + offset], t[4 + offset]); - float32x4_t tmp3 = vsubq_f32(t[1 + offset], t[2 + offset]); - float32x4_t tmp4 = vsubq_f32(t[3 + offset], t[4 + offset]); - m[l] = vaddq_f32(vaddq_f32(vaddq_f32(t[offset], tmp1), tmp2), bias_ptr); - m[l + 5] = vaddq_f32(vaddq_f32(tmp3, vmulq_n_f32(tmp4, 2)), bias_ptr); - m[l + 10] = vaddq_f32(vaddq_f32(tmp1, vmulq_n_f32(tmp2, 4)), bias_ptr); - m[l + 15] = vaddq_f32(vaddq_f32(tmp3, vmulq_n_f32(tmp4, 8)), bias_ptr); - m[l + 20] = vaddq_f32(vaddq_f32(vaddq_f32(tmp1, vmulq_n_f32(tmp2, 16)), t[5 + offset]), bias_ptr); + MS_FLOAT32X4 tmp1 = MS_ADDQ_F32(t[1 + offset], t[2 + offset]); + MS_FLOAT32X4 tmp2 = MS_ADDQ_F32(t[3 + offset], t[4 + offset]); + MS_FLOAT32X4 tmp3 = MS_SUBQ_F32(t[1 + offset], t[2 + offset]); + MS_FLOAT32X4 tmp4 = MS_SUBQ_F32(t[3 + offset], t[4 + offset]); + m[l] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(t[offset], tmp1), tmp2), bias_ptr); + m[l + 5] = MS_ADDQ_F32(MS_ADDQ_F32(tmp3, MS_MULQ_F32(tmp4, 2)), bias_ptr); + m[l + 10] = MS_ADDQ_F32(MS_ADDQ_F32(tmp1, MS_MULQ_F32(tmp2, 4)), bias_ptr); + m[l + 15] = MS_ADDQ_F32(MS_ADDQ_F32(tmp3, MS_MULQ_F32(tmp4, 8)), bias_ptr); + m[l + 20] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(tmp1, MS_MULQ_F32(tmp2, 16)), t[5 + offset]), bias_ptr); } if (r_c == C4NUM && r_h == 5 && r_w == 5) { Store25Data; @@ -1659,41 +1670,41 @@ void OutputTransform6x5Unit(const float *src_data, float *dst_data, const float void OutputTransform6x5ReluUnit(const float *src_data, float *dst_data, const float *bias_data, int src_step, int dst_step, int out_c, int r_w, int r_h, int r_c) { -#ifdef ENABLE_ARM - float32x4_t src[36]; - float32x4_t t[30]; - float32x4_t m[25]; - float32x4_t zero = vdupq_n_f32(0); +#if defined(ENABLE_ARM) || defined(ENABLE_SSE) + MS_FLOAT32X4 src[36]; + MS_FLOAT32X4 t[30]; + MS_FLOAT32X4 m[25]; + MS_FLOAT32X4 zero = MS_MOVQ_F32(0); Load36Data; - float32x4_t bias_ptr = vld1q_f32(bias_data); + MS_FLOAT32X4 bias_ptr = MS_LDQ_F32(bias_data); for (int l = 0; l < 6; ++l) { int offset = l * 6; - float32x4_t tmp1 = vaddq_f32(src[1 + offset], src[2 + offset]); - float32x4_t tmp2 = vaddq_f32(src[3 + offset], src[4 + offset]); - 
float32x4_t tmp3 = vsubq_f32(src[1 + offset], src[2 + offset]); - float32x4_t tmp4 = vsubq_f32(src[3 + offset], src[4 + offset]); - t[l] = vaddq_f32(vaddq_f32(src[offset], tmp1), tmp2); - t[l + 6] = vaddq_f32(tmp3, vmulq_n_f32(tmp4, 2)); - t[l + 12] = vaddq_f32(tmp1, vmulq_n_f32(tmp2, 4)); - t[l + 18] = vaddq_f32(tmp3, vmulq_n_f32(tmp4, 8)); - t[l + 24] = vaddq_f32(vaddq_f32(tmp1, vmulq_n_f32(tmp2, 16)), src[5 + offset]); + MS_FLOAT32X4 tmp1 = MS_ADDQ_F32(src[1 + offset], src[2 + offset]); + MS_FLOAT32X4 tmp2 = MS_ADDQ_F32(src[3 + offset], src[4 + offset]); + MS_FLOAT32X4 tmp3 = MS_SUBQ_F32(src[1 + offset], src[2 + offset]); + MS_FLOAT32X4 tmp4 = MS_SUBQ_F32(src[3 + offset], src[4 + offset]); + t[l] = MS_ADDQ_F32(MS_ADDQ_F32(src[offset], tmp1), tmp2); + t[l + 6] = MS_ADDQ_F32(tmp3, MS_MULQ_F32(tmp4, 2)); + t[l + 12] = MS_ADDQ_F32(tmp1, MS_MULQ_F32(tmp2, 4)); + t[l + 18] = MS_ADDQ_F32(tmp3, MS_MULQ_F32(tmp4, 8)); + t[l + 24] = MS_ADDQ_F32(MS_ADDQ_F32(tmp1, MS_MULQ_F32(tmp2, 16)), src[5 + offset]); } for (int l = 0; l < 5; ++l) { int offset = l * 6; - float32x4_t tmp1 = vaddq_f32(t[1 + offset], t[2 + offset]); - float32x4_t tmp2 = vaddq_f32(t[3 + offset], t[4 + offset]); - float32x4_t tmp3 = vsubq_f32(t[1 + offset], t[2 + offset]); - float32x4_t tmp4 = vsubq_f32(t[3 + offset], t[4 + offset]); - m[l] = vaddq_f32(vaddq_f32(vaddq_f32(t[offset], tmp1), tmp2), bias_ptr); - m[l + 5] = vaddq_f32(vaddq_f32(tmp3, vmulq_n_f32(tmp4, 2)), bias_ptr); - m[l + 10] = vaddq_f32(vaddq_f32(tmp1, vmulq_n_f32(tmp2, 4)), bias_ptr); - m[l + 15] = vaddq_f32(vaddq_f32(tmp3, vmulq_n_f32(tmp4, 8)), bias_ptr); - m[l + 20] = vaddq_f32(vaddq_f32(vaddq_f32(tmp1, vmulq_n_f32(tmp2, 16)), t[5 + offset]), bias_ptr); - m[l] = vmaxq_f32(zero, m[l]); - m[l + 5] = vmaxq_f32(zero, m[l + 5]); - m[l + 10] = vmaxq_f32(zero, m[l + 10]); - m[l + 15] = vmaxq_f32(zero, m[l + 15]); - m[l + 20] = vmaxq_f32(zero, m[l + 20]); + MS_FLOAT32X4 tmp1 = MS_ADDQ_F32(t[1 + offset], t[2 + offset]); + MS_FLOAT32X4 tmp2 = MS_ADDQ_F32(t[3 + offset], t[4 + offset]); + MS_FLOAT32X4 tmp3 = MS_SUBQ_F32(t[1 + offset], t[2 + offset]); + MS_FLOAT32X4 tmp4 = MS_SUBQ_F32(t[3 + offset], t[4 + offset]); + m[l] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(t[offset], tmp1), tmp2), bias_ptr); + m[l + 5] = MS_ADDQ_F32(MS_ADDQ_F32(tmp3, MS_MULQ_F32(tmp4, 2)), bias_ptr); + m[l + 10] = MS_ADDQ_F32(MS_ADDQ_F32(tmp1, MS_MULQ_F32(tmp2, 4)), bias_ptr); + m[l + 15] = MS_ADDQ_F32(MS_ADDQ_F32(tmp3, MS_MULQ_F32(tmp4, 8)), bias_ptr); + m[l + 20] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(tmp1, MS_MULQ_F32(tmp2, 16)), t[5 + offset]), bias_ptr); + m[l] = MS_MAXQ_F32(zero, m[l]); + m[l + 5] = MS_MAXQ_F32(zero, m[l + 5]); + m[l + 10] = MS_MAXQ_F32(zero, m[l + 10]); + m[l + 15] = MS_MAXQ_F32(zero, m[l + 15]); + m[l + 20] = MS_MAXQ_F32(zero, m[l + 20]); } if (r_c == C4NUM && r_h == 5 && r_w == 5) { Store25Data; @@ -1749,47 +1760,47 @@ void OutputTransform6x5ReluUnit(const float *src_data, float *dst_data, const fl void OutputTransform6x5Relu6Unit(const float *src_data, float *dst_data, const float *bias_data, int src_step, int dst_step, int out_c, int r_w, int r_h, int r_c) { -#ifdef ENABLE_ARM - float32x4_t src[36]; - float32x4_t t[30]; - float32x4_t m[25]; - float32x4_t zero = vdupq_n_f32(0); - float32x4_t six = vdupq_n_f32(6); +#if defined(ENABLE_ARM) || defined(ENABLE_SSE) + MS_FLOAT32X4 src[36]; + MS_FLOAT32X4 t[30]; + MS_FLOAT32X4 m[25]; + MS_FLOAT32X4 zero = MS_MOVQ_F32(0); + MS_FLOAT32X4 six = MS_MOVQ_F32(6); Load36Data; - float32x4_t bias_ptr = vld1q_f32(bias_data); + MS_FLOAT32X4 
bias_ptr = MS_LDQ_F32(bias_data); for (int l = 0; l < 6; ++l) { int offset = l * 6; - float32x4_t tmp1 = vaddq_f32(src[1 + offset], src[2 + offset]); - float32x4_t tmp2 = vaddq_f32(src[3 + offset], src[4 + offset]); - float32x4_t tmp3 = vsubq_f32(src[1 + offset], src[2 + offset]); - float32x4_t tmp4 = vsubq_f32(src[3 + offset], src[4 + offset]); - t[l] = vaddq_f32(vaddq_f32(src[offset], tmp1), tmp2); - t[l + 6] = vaddq_f32(tmp3, vmulq_n_f32(tmp4, 2)); - t[l + 12] = vaddq_f32(tmp1, vmulq_n_f32(tmp2, 4)); - t[l + 18] = vaddq_f32(tmp3, vmulq_n_f32(tmp4, 8)); - t[l + 24] = vaddq_f32(vaddq_f32(tmp1, vmulq_n_f32(tmp2, 16)), src[5 + offset]); + MS_FLOAT32X4 tmp1 = MS_ADDQ_F32(src[1 + offset], src[2 + offset]); + MS_FLOAT32X4 tmp2 = MS_ADDQ_F32(src[3 + offset], src[4 + offset]); + MS_FLOAT32X4 tmp3 = MS_SUBQ_F32(src[1 + offset], src[2 + offset]); + MS_FLOAT32X4 tmp4 = MS_SUBQ_F32(src[3 + offset], src[4 + offset]); + t[l] = MS_ADDQ_F32(MS_ADDQ_F32(src[offset], tmp1), tmp2); + t[l + 6] = MS_ADDQ_F32(tmp3, MS_MULQ_F32(tmp4, 2)); + t[l + 12] = MS_ADDQ_F32(tmp1, MS_MULQ_F32(tmp2, 4)); + t[l + 18] = MS_ADDQ_F32(tmp3, MS_MULQ_F32(tmp4, 8)); + t[l + 24] = MS_ADDQ_F32(MS_ADDQ_F32(tmp1, MS_MULQ_F32(tmp2, 16)), src[5 + offset]); } for (int l = 0; l < 5; ++l) { int offset = l * 6; - float32x4_t tmp1 = vaddq_f32(t[1 + offset], t[2 + offset]); - float32x4_t tmp2 = vaddq_f32(t[3 + offset], t[4 + offset]); - float32x4_t tmp3 = vsubq_f32(t[1 + offset], t[2 + offset]); - float32x4_t tmp4 = vsubq_f32(t[3 + offset], t[4 + offset]); - m[l] = vaddq_f32(vaddq_f32(vaddq_f32(t[offset], tmp1), tmp2), bias_ptr); - m[l + 5] = vaddq_f32(vaddq_f32(tmp3, vmulq_n_f32(tmp4, 2)), bias_ptr); - m[l + 10] = vaddq_f32(vaddq_f32(tmp1, vmulq_n_f32(tmp2, 4)), bias_ptr); - m[l + 15] = vaddq_f32(vaddq_f32(tmp3, vmulq_n_f32(tmp4, 8)), bias_ptr); - m[l + 20] = vaddq_f32(vaddq_f32(vaddq_f32(tmp1, vmulq_n_f32(tmp2, 16)), t[5 + offset]), bias_ptr); - m[l] = vmaxq_f32(zero, m[l]); - m[l] = vminq_f32(six, m[l]); - m[l + 5] = vmaxq_f32(zero, m[l + 5]); - m[l + 5] = vminq_f32(six, m[l + 5]); - m[l + 10] = vmaxq_f32(zero, m[l + 10]); - m[l + 10] = vminq_f32(six, m[l + 10]); - m[l + 15] = vmaxq_f32(zero, m[l + 15]); - m[l + 15] = vminq_f32(six, m[l + 15]); - m[l + 20] = vmaxq_f32(zero, m[l + 20]); - m[l + 20] = vminq_f32(six, m[l + 20]); + MS_FLOAT32X4 tmp1 = MS_ADDQ_F32(t[1 + offset], t[2 + offset]); + MS_FLOAT32X4 tmp2 = MS_ADDQ_F32(t[3 + offset], t[4 + offset]); + MS_FLOAT32X4 tmp3 = MS_SUBQ_F32(t[1 + offset], t[2 + offset]); + MS_FLOAT32X4 tmp4 = MS_SUBQ_F32(t[3 + offset], t[4 + offset]); + m[l] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(t[offset], tmp1), tmp2), bias_ptr); + m[l + 5] = MS_ADDQ_F32(MS_ADDQ_F32(tmp3, MS_MULQ_F32(tmp4, 2)), bias_ptr); + m[l + 10] = MS_ADDQ_F32(MS_ADDQ_F32(tmp1, MS_MULQ_F32(tmp2, 4)), bias_ptr); + m[l + 15] = MS_ADDQ_F32(MS_ADDQ_F32(tmp3, MS_MULQ_F32(tmp4, 8)), bias_ptr); + m[l + 20] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(tmp1, MS_MULQ_F32(tmp2, 16)), t[5 + offset]), bias_ptr); + m[l] = MS_MAXQ_F32(zero, m[l]); + m[l] = MS_MINQ_F32(six, m[l]); + m[l + 5] = MS_MAXQ_F32(zero, m[l + 5]); + m[l + 5] = MS_MINQ_F32(six, m[l + 5]); + m[l + 10] = MS_MAXQ_F32(zero, m[l + 10]); + m[l + 10] = MS_MINQ_F32(six, m[l + 10]); + m[l + 15] = MS_MAXQ_F32(zero, m[l + 15]); + m[l + 15] = MS_MINQ_F32(six, m[l + 15]); + m[l + 20] = MS_MAXQ_F32(zero, m[l + 20]); + m[l + 20] = MS_MINQ_F32(six, m[l + 20]); } if (r_c == C4NUM && r_h == 5 && r_w == 5) { Store25Data; @@ -1846,34 +1857,36 @@ void OutputTransform6x5Relu6Unit(const float *src_data, float 
*dst_data, const f void OutputTransform8x2Unit(const float *src_data, float *dst_data, const float *bias_data, int src_step, int dst_step, int out_c, int r_w, int r_h, int r_c) { -#ifdef ENABLE_ARM - float32x4_t src[64]; - float32x4_t t[16]; - float32x4_t m[4]; +#if defined(ENABLE_ARM) || defined(ENABLE_SSE) + MS_FLOAT32X4 src[64]; + MS_FLOAT32X4 t[16]; + MS_FLOAT32X4 m[4]; Load64Data; - float32x4_t bias_ptr = vld1q_f32(bias_data); + MS_FLOAT32X4 bias_ptr = MS_LDQ_F32(bias_data); for (int l = 0; l < 8; ++l) { int offset = l * 8; - float32x4_t tmp1 = vaddq_f32(src[1 + offset], src[2 + offset]); - float32x4_t tmp2 = vaddq_f32(src[3 + offset], src[4 + offset]); - float32x4_t tmp3 = vaddq_f32(src[5 + offset], src[6 + offset]); - float32x4_t tmp4 = vsubq_f32(src[1 + offset], src[2 + offset]); - float32x4_t tmp5 = vsubq_f32(src[3 + offset], src[4 + offset]); - float32x4_t tmp6 = vsubq_f32(src[5 + offset], src[6 + offset]); - t[l] = vaddq_f32(vaddq_f32(vaddq_f32(src[offset], tmp1), tmp2), tmp3); - t[l + 8] = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.5), tmp5), vmulq_n_f32(tmp6, 1.5)), src[7 + offset]); + MS_FLOAT32X4 tmp1 = MS_ADDQ_F32(src[1 + offset], src[2 + offset]); + MS_FLOAT32X4 tmp2 = MS_ADDQ_F32(src[3 + offset], src[4 + offset]); + MS_FLOAT32X4 tmp3 = MS_ADDQ_F32(src[5 + offset], src[6 + offset]); + MS_FLOAT32X4 tmp4 = MS_SUBQ_F32(src[1 + offset], src[2 + offset]); + MS_FLOAT32X4 tmp5 = MS_SUBQ_F32(src[3 + offset], src[4 + offset]); + MS_FLOAT32X4 tmp6 = MS_SUBQ_F32(src[5 + offset], src[6 + offset]); + t[l] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(src[offset], tmp1), tmp2), tmp3); + t[l + 8] = + MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.5), tmp5), MS_MULQ_F32(tmp6, 1.5)), src[7 + offset]); } for (int l = 0; l < 2; ++l) { int offset = l * 8; - float32x4_t tmp1 = vaddq_f32(t[1 + offset], t[2 + offset]); - float32x4_t tmp2 = vaddq_f32(t[3 + offset], t[4 + offset]); - float32x4_t tmp3 = vaddq_f32(t[5 + offset], t[6 + offset]); - float32x4_t tmp4 = vsubq_f32(t[1 + offset], t[2 + offset]); - float32x4_t tmp5 = vsubq_f32(t[3 + offset], t[4 + offset]); - float32x4_t tmp6 = vsubq_f32(t[5 + offset], t[6 + offset]); - m[l] = vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(t[offset], tmp1), tmp2), tmp3), bias_ptr); - m[l + 2] = vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.5), tmp5), vmulq_n_f32(tmp6, 1.5)), t[7 + offset]), bias_ptr); + MS_FLOAT32X4 tmp1 = MS_ADDQ_F32(t[1 + offset], t[2 + offset]); + MS_FLOAT32X4 tmp2 = MS_ADDQ_F32(t[3 + offset], t[4 + offset]); + MS_FLOAT32X4 tmp3 = MS_ADDQ_F32(t[5 + offset], t[6 + offset]); + MS_FLOAT32X4 tmp4 = MS_SUBQ_F32(t[1 + offset], t[2 + offset]); + MS_FLOAT32X4 tmp5 = MS_SUBQ_F32(t[3 + offset], t[4 + offset]); + MS_FLOAT32X4 tmp6 = MS_SUBQ_F32(t[5 + offset], t[6 + offset]); + m[l] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(t[offset], tmp1), tmp2), tmp3), bias_ptr); + m[l + 2] = MS_ADDQ_F32( + MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.5), tmp5), MS_MULQ_F32(tmp6, 1.5)), t[7 + offset]), + bias_ptr); } if (r_c == C4NUM && r_h == 2 && r_w == 2) { Store4Data; @@ -1924,37 +1937,39 @@ void OutputTransform8x2Unit(const float *src_data, float *dst_data, const float void OutputTransform8x2ReluUnit(const float *src_data, float *dst_data, const float *bias_data, int src_step, int dst_step, int out_c, int r_w, int r_h, int r_c) { -#ifdef ENABLE_ARM - float32x4_t src[64]; - float32x4_t t[16]; - float32x4_t m[4]; - float32x4_t zero = vdupq_n_f32(0); +#if defined(ENABLE_ARM) || defined(ENABLE_SSE) + MS_FLOAT32X4 src[64]; + 
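// Scalar reference for one column of the length-8 -> 2 Winograd output
// transform implemented vectorially above. The interpolation nodes are
// 0, +-0.5, +-1, +-1.5 plus the "infinity" tap s[7]; the A^T row for output k
// weights each +- node pair by node^k, so even rows use the pair sums and odd
// rows the pair differences. Hypothetical helper, not a function in this patch:
static inline void WinogradOutputCol8x2(const float s[8], float d[2]) {
  float sum1 = s[1] + s[2], sum2 = s[3] + s[4], sum3 = s[5] + s[6];
  float dif1 = s[1] - s[2], dif2 = s[3] - s[4], dif3 = s[5] - s[6];
  d[0] = s[0] + sum1 + sum2 + sum3;               /* node^0 row */
  d[1] = 0.5f * dif1 + dif2 + 1.5f * dif3 + s[7]; /* node^1 row + infinity tap */
}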
MS_FLOAT32X4 t[16]; + MS_FLOAT32X4 m[4]; + MS_FLOAT32X4 zero = MS_MOVQ_F32(0); Load64Data; - float32x4_t bias_ptr = vld1q_f32(bias_data); + MS_FLOAT32X4 bias_ptr = MS_LDQ_F32(bias_data); for (int l = 0; l < 8; ++l) { int offset = l * 8; - float32x4_t tmp1 = vaddq_f32(src[1 + offset], src[2 + offset]); - float32x4_t tmp2 = vaddq_f32(src[3 + offset], src[4 + offset]); - float32x4_t tmp3 = vaddq_f32(src[5 + offset], src[6 + offset]); - float32x4_t tmp4 = vsubq_f32(src[1 + offset], src[2 + offset]); - float32x4_t tmp5 = vsubq_f32(src[3 + offset], src[4 + offset]); - float32x4_t tmp6 = vsubq_f32(src[5 + offset], src[6 + offset]); - t[l] = vaddq_f32(vaddq_f32(vaddq_f32(src[offset], tmp1), tmp2), tmp3); - t[l + 8] = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.5), tmp5), vmulq_n_f32(tmp6, 1.5)), src[7 + offset]); + MS_FLOAT32X4 tmp1 = MS_ADDQ_F32(src[1 + offset], src[2 + offset]); + MS_FLOAT32X4 tmp2 = MS_ADDQ_F32(src[3 + offset], src[4 + offset]); + MS_FLOAT32X4 tmp3 = MS_ADDQ_F32(src[5 + offset], src[6 + offset]); + MS_FLOAT32X4 tmp4 = MS_SUBQ_F32(src[1 + offset], src[2 + offset]); + MS_FLOAT32X4 tmp5 = MS_SUBQ_F32(src[3 + offset], src[4 + offset]); + MS_FLOAT32X4 tmp6 = MS_SUBQ_F32(src[5 + offset], src[6 + offset]); + t[l] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(src[offset], tmp1), tmp2), tmp3); + t[l + 8] = + MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.5), tmp5), MS_MULQ_F32(tmp6, 1.5)), src[7 + offset]); } for (int l = 0; l < 2; ++l) { int offset = l * 8; - float32x4_t tmp1 = vaddq_f32(t[1 + offset], t[2 + offset]); - float32x4_t tmp2 = vaddq_f32(t[3 + offset], t[4 + offset]); - float32x4_t tmp3 = vaddq_f32(t[5 + offset], t[6 + offset]); - float32x4_t tmp4 = vsubq_f32(t[1 + offset], t[2 + offset]); - float32x4_t tmp5 = vsubq_f32(t[3 + offset], t[4 + offset]); - float32x4_t tmp6 = vsubq_f32(t[5 + offset], t[6 + offset]); - m[l] = vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(t[offset], tmp1), tmp2), tmp3), bias_ptr); - m[l + 2] = vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.5), tmp5), vmulq_n_f32(tmp6, 1.5)), t[7 + offset]), bias_ptr); - m[l] = vmaxq_f32(zero, m[l]); - m[l + 2] = vmaxq_f32(zero, m[l + 2]); + MS_FLOAT32X4 tmp1 = MS_ADDQ_F32(t[1 + offset], t[2 + offset]); + MS_FLOAT32X4 tmp2 = MS_ADDQ_F32(t[3 + offset], t[4 + offset]); + MS_FLOAT32X4 tmp3 = MS_ADDQ_F32(t[5 + offset], t[6 + offset]); + MS_FLOAT32X4 tmp4 = MS_SUBQ_F32(t[1 + offset], t[2 + offset]); + MS_FLOAT32X4 tmp5 = MS_SUBQ_F32(t[3 + offset], t[4 + offset]); + MS_FLOAT32X4 tmp6 = MS_SUBQ_F32(t[5 + offset], t[6 + offset]); + m[l] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(t[offset], tmp1), tmp2), tmp3), bias_ptr); + m[l + 2] = MS_ADDQ_F32( + MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.5), tmp5), MS_MULQ_F32(tmp6, 1.5)), t[7 + offset]), + bias_ptr); + m[l] = MS_MAXQ_F32(zero, m[l]); + m[l + 2] = MS_MAXQ_F32(zero, m[l + 2]); } if (r_c == C4NUM && r_h == 2 && r_w == 2) { Store4Data; @@ -2007,40 +2022,42 @@ void OutputTransform8x2ReluUnit(const float *src_data, float *dst_data, const fl void OutputTransform8x2Relu6Unit(const float *src_data, float *dst_data, const float *bias_data, int src_step, int dst_step, int out_c, int r_w, int r_h, int r_c) { -#ifdef ENABLE_ARM - float32x4_t src[64]; - float32x4_t t[16]; - float32x4_t m[4]; - float32x4_t zero = vdupq_n_f32(0); - float32x4_t six = vdupq_n_f32(6); +#if defined(ENABLE_ARM) || defined(ENABLE_SSE) + MS_FLOAT32X4 src[64]; + MS_FLOAT32X4 t[16]; + MS_FLOAT32X4 m[4]; + MS_FLOAT32X4 zero = MS_MOVQ_F32(0); + MS_FLOAT32X4 six = 
MS_MOVQ_F32(6); Load64Data; - float32x4_t bias_ptr = vld1q_f32(bias_data); + MS_FLOAT32X4 bias_ptr = MS_LDQ_F32(bias_data); for (int l = 0; l < 8; ++l) { int offset = l * 8; - float32x4_t tmp1 = vaddq_f32(src[1 + offset], src[2 + offset]); - float32x4_t tmp2 = vaddq_f32(src[3 + offset], src[4 + offset]); - float32x4_t tmp3 = vaddq_f32(src[5 + offset], src[6 + offset]); - float32x4_t tmp4 = vsubq_f32(src[1 + offset], src[2 + offset]); - float32x4_t tmp5 = vsubq_f32(src[3 + offset], src[4 + offset]); - float32x4_t tmp6 = vsubq_f32(src[5 + offset], src[6 + offset]); - t[l] = vaddq_f32(vaddq_f32(vaddq_f32(src[offset], tmp1), tmp2), tmp3); - t[l + 8] = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.5), tmp5), vmulq_n_f32(tmp6, 1.5)), src[7 + offset]); + MS_FLOAT32X4 tmp1 = MS_ADDQ_F32(src[1 + offset], src[2 + offset]); + MS_FLOAT32X4 tmp2 = MS_ADDQ_F32(src[3 + offset], src[4 + offset]); + MS_FLOAT32X4 tmp3 = MS_ADDQ_F32(src[5 + offset], src[6 + offset]); + MS_FLOAT32X4 tmp4 = MS_SUBQ_F32(src[1 + offset], src[2 + offset]); + MS_FLOAT32X4 tmp5 = MS_SUBQ_F32(src[3 + offset], src[4 + offset]); + MS_FLOAT32X4 tmp6 = MS_SUBQ_F32(src[5 + offset], src[6 + offset]); + t[l] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(src[offset], tmp1), tmp2), tmp3); + t[l + 8] = + MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.5), tmp5), MS_MULQ_F32(tmp6, 1.5)), src[7 + offset]); } for (int l = 0; l < 2; ++l) { int offset = l * 8; - float32x4_t tmp1 = vaddq_f32(t[1 + offset], t[2 + offset]); - float32x4_t tmp2 = vaddq_f32(t[3 + offset], t[4 + offset]); - float32x4_t tmp3 = vaddq_f32(t[5 + offset], t[6 + offset]); - float32x4_t tmp4 = vsubq_f32(t[1 + offset], t[2 + offset]); - float32x4_t tmp5 = vsubq_f32(t[3 + offset], t[4 + offset]); - float32x4_t tmp6 = vsubq_f32(t[5 + offset], t[6 + offset]); - m[l] = vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(t[offset], tmp1), tmp2), tmp3), bias_ptr); - m[l + 2] = vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.5), tmp5), vmulq_n_f32(tmp6, 1.5)), t[7 + offset]), bias_ptr); - m[l] = vmaxq_f32(zero, m[l]); - m[l] = vminq_f32(six, m[l]); - m[l + 2] = vmaxq_f32(zero, m[l + 2]); - m[l + 2] = vminq_f32(six, m[l + 2]); + MS_FLOAT32X4 tmp1 = MS_ADDQ_F32(t[1 + offset], t[2 + offset]); + MS_FLOAT32X4 tmp2 = MS_ADDQ_F32(t[3 + offset], t[4 + offset]); + MS_FLOAT32X4 tmp3 = MS_ADDQ_F32(t[5 + offset], t[6 + offset]); + MS_FLOAT32X4 tmp4 = MS_SUBQ_F32(t[1 + offset], t[2 + offset]); + MS_FLOAT32X4 tmp5 = MS_SUBQ_F32(t[3 + offset], t[4 + offset]); + MS_FLOAT32X4 tmp6 = MS_SUBQ_F32(t[5 + offset], t[6 + offset]); + m[l] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(t[offset], tmp1), tmp2), tmp3), bias_ptr); + m[l + 2] = MS_ADDQ_F32( + MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.5), tmp5), MS_MULQ_F32(tmp6, 1.5)), t[7 + offset]), + bias_ptr); + m[l] = MS_MAXQ_F32(zero, m[l]); + m[l] = MS_MINQ_F32(six, m[l]); + m[l + 2] = MS_MAXQ_F32(zero, m[l + 2]); + m[l + 2] = MS_MINQ_F32(six, m[l + 2]); } if (r_c == C4NUM && r_h == 2 && r_w == 2) { Store4Data; @@ -2094,37 +2111,38 @@ void OutputTransform8x2Relu6Unit(const float *src_data, float *dst_data, const f void OutputTransform8x3Unit(const float *src_data, float *dst_data, const float *bias_data, int src_step, int dst_step, int out_c, int r_w, int r_h, int r_c) { -#ifdef ENABLE_ARM - float32x4_t src[64]; - float32x4_t t[24]; - float32x4_t m[9]; +#if defined(ENABLE_ARM) || defined(ENABLE_SSE) + MS_FLOAT32X4 src[64]; + MS_FLOAT32X4 t[24]; + MS_FLOAT32X4 m[9]; Load64Data; - float32x4_t bias_ptr = vld1q_f32(bias_data); + 
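// The Relu/Relu6 variants fuse the activation into the output transform:
// after the bias add, every lane is clamped with a max against zero (ReLU),
// and the Relu6 variants additionally apply a min against six. Illustrative
// helper built on the same portability macros (a sketch, not a function in
// this file):
static inline MS_FLOAT32X4 Relu6Q(MS_FLOAT32X4 v) {
  v = MS_MAXQ_F32(MS_MOVQ_F32(0), v);    /* lower clamp at 0 (ReLU) */
  return MS_MINQ_F32(MS_MOVQ_F32(6), v); /* upper clamp at 6 (ReLU6) */
}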
MS_FLOAT32X4 bias_ptr = MS_LDQ_F32(bias_data); for (int l = 0; l < 8; ++l) { int offset = l * 8; - float32x4_t tmp1 = vaddq_f32(src[1 + offset], src[2 + offset]); - float32x4_t tmp2 = vaddq_f32(src[3 + offset], src[4 + offset]); - float32x4_t tmp3 = vaddq_f32(src[5 + offset], src[6 + offset]); - float32x4_t tmp4 = vsubq_f32(src[1 + offset], src[2 + offset]); - float32x4_t tmp5 = vsubq_f32(src[3 + offset], src[4 + offset]); - float32x4_t tmp6 = vsubq_f32(src[5 + offset], src[6 + offset]); - t[l] = vaddq_f32(vaddq_f32(vaddq_f32(src[offset], tmp1), tmp2), tmp3); - t[l + 8] = vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.5), tmp5), vmulq_n_f32(tmp6, 1.5)); + MS_FLOAT32X4 tmp1 = MS_ADDQ_F32(src[1 + offset], src[2 + offset]); + MS_FLOAT32X4 tmp2 = MS_ADDQ_F32(src[3 + offset], src[4 + offset]); + MS_FLOAT32X4 tmp3 = MS_ADDQ_F32(src[5 + offset], src[6 + offset]); + MS_FLOAT32X4 tmp4 = MS_SUBQ_F32(src[1 + offset], src[2 + offset]); + MS_FLOAT32X4 tmp5 = MS_SUBQ_F32(src[3 + offset], src[4 + offset]); + MS_FLOAT32X4 tmp6 = MS_SUBQ_F32(src[5 + offset], src[6 + offset]); + t[l] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(src[offset], tmp1), tmp2), tmp3); + t[l + 8] = MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.5), tmp5), MS_MULQ_F32(tmp6, 1.5)); t[l + 16] = - vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp1, 0.25), tmp2), vmulq_n_f32(tmp3, 2.25)), src[7 + offset]); + MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp1, 0.25), tmp2), MS_MULQ_F32(tmp3, 2.25)), src[7 + offset]); } for (int l = 0; l < 3; ++l) { int offset = l * 8; - float32x4_t tmp1 = vaddq_f32(t[1 + offset], t[2 + offset]); - float32x4_t tmp2 = vaddq_f32(t[3 + offset], t[4 + offset]); - float32x4_t tmp3 = vaddq_f32(t[5 + offset], t[6 + offset]); - float32x4_t tmp4 = vsubq_f32(t[1 + offset], t[2 + offset]); - float32x4_t tmp5 = vsubq_f32(t[3 + offset], t[4 + offset]); - float32x4_t tmp6 = vsubq_f32(t[5 + offset], t[6 + offset]); - m[l] = vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(t[offset], tmp1), tmp2), tmp3), bias_ptr); - m[l + 3] = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.5), tmp5), vmulq_n_f32(tmp6, 1.5)), bias_ptr); - m[l + 6] = vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp1, 0.25), tmp2), vmulq_n_f32(tmp3, 2.25)), t[7 + offset]), bias_ptr); + MS_FLOAT32X4 tmp1 = MS_ADDQ_F32(t[1 + offset], t[2 + offset]); + MS_FLOAT32X4 tmp2 = MS_ADDQ_F32(t[3 + offset], t[4 + offset]); + MS_FLOAT32X4 tmp3 = MS_ADDQ_F32(t[5 + offset], t[6 + offset]); + MS_FLOAT32X4 tmp4 = MS_SUBQ_F32(t[1 + offset], t[2 + offset]); + MS_FLOAT32X4 tmp5 = MS_SUBQ_F32(t[3 + offset], t[4 + offset]); + MS_FLOAT32X4 tmp6 = MS_SUBQ_F32(t[5 + offset], t[6 + offset]); + m[l] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(t[offset], tmp1), tmp2), tmp3), bias_ptr); + m[l + 3] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.5), tmp5), MS_MULQ_F32(tmp6, 1.5)), bias_ptr); + m[l + 6] = MS_ADDQ_F32( + MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp1, 0.25), tmp2), MS_MULQ_F32(tmp3, 2.25)), t[7 + offset]), + bias_ptr); } if (r_c == C4NUM && r_h == 3 && r_w == 3) { Store9Data; @@ -2179,41 +2197,42 @@ void OutputTransform8x3Unit(const float *src_data, float *dst_data, const float void OutputTransform8x3ReluUnit(const float *src_data, float *dst_data, const float *bias_data, int src_step, int dst_step, int out_c, int r_w, int r_h, int r_c) { -#ifdef ENABLE_ARM - float32x4_t src[64]; - float32x4_t t[24]; - float32x4_t m[9]; - float32x4_t zero = vdupq_n_f32(0); +#if defined(ENABLE_ARM) || defined(ENABLE_SSE) + MS_FLOAT32X4 src[64]; + MS_FLOAT32X4 t[24]; + MS_FLOAT32X4 
m[9]; + MS_FLOAT32X4 zero = MS_MOVQ_F32(0); Load64Data; - float32x4_t bias_ptr = vld1q_f32(bias_data); + MS_FLOAT32X4 bias_ptr = MS_LDQ_F32(bias_data); for (int l = 0; l < 8; ++l) { int offset = l * 8; - float32x4_t tmp1 = vaddq_f32(src[1 + offset], src[2 + offset]); - float32x4_t tmp2 = vaddq_f32(src[3 + offset], src[4 + offset]); - float32x4_t tmp3 = vaddq_f32(src[5 + offset], src[6 + offset]); - float32x4_t tmp4 = vsubq_f32(src[1 + offset], src[2 + offset]); - float32x4_t tmp5 = vsubq_f32(src[3 + offset], src[4 + offset]); - float32x4_t tmp6 = vsubq_f32(src[5 + offset], src[6 + offset]); - t[l] = vaddq_f32(vaddq_f32(vaddq_f32(src[offset], tmp1), tmp2), tmp3); - t[l + 8] = vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.5), tmp5), vmulq_n_f32(tmp6, 1.5)); + MS_FLOAT32X4 tmp1 = MS_ADDQ_F32(src[1 + offset], src[2 + offset]); + MS_FLOAT32X4 tmp2 = MS_ADDQ_F32(src[3 + offset], src[4 + offset]); + MS_FLOAT32X4 tmp3 = MS_ADDQ_F32(src[5 + offset], src[6 + offset]); + MS_FLOAT32X4 tmp4 = MS_SUBQ_F32(src[1 + offset], src[2 + offset]); + MS_FLOAT32X4 tmp5 = MS_SUBQ_F32(src[3 + offset], src[4 + offset]); + MS_FLOAT32X4 tmp6 = MS_SUBQ_F32(src[5 + offset], src[6 + offset]); + t[l] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(src[offset], tmp1), tmp2), tmp3); + t[l + 8] = MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.5), tmp5), MS_MULQ_F32(tmp6, 1.5)); t[l + 16] = - vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp1, 0.25), tmp2), vmulq_n_f32(tmp3, 2.25)), src[7 + offset]); + MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp1, 0.25), tmp2), MS_MULQ_F32(tmp3, 2.25)), src[7 + offset]); } for (int l = 0; l < 3; ++l) { int offset = l * 8; - float32x4_t tmp1 = vaddq_f32(t[1 + offset], t[2 + offset]); - float32x4_t tmp2 = vaddq_f32(t[3 + offset], t[4 + offset]); - float32x4_t tmp3 = vaddq_f32(t[5 + offset], t[6 + offset]); - float32x4_t tmp4 = vsubq_f32(t[1 + offset], t[2 + offset]); - float32x4_t tmp5 = vsubq_f32(t[3 + offset], t[4 + offset]); - float32x4_t tmp6 = vsubq_f32(t[5 + offset], t[6 + offset]); - m[l] = vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(t[offset], tmp1), tmp2), tmp3), bias_ptr); - m[l + 3] = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.5), tmp5), vmulq_n_f32(tmp6, 1.5)), bias_ptr); - m[l + 6] = vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp1, 0.25), tmp2), vmulq_n_f32(tmp3, 2.25)), t[7 + offset]), bias_ptr); - m[l] = vmaxq_f32(zero, m[l]); - m[l + 3] = vmaxq_f32(zero, m[l + 3]); - m[l + 6] = vmaxq_f32(zero, m[l + 6]); + MS_FLOAT32X4 tmp1 = MS_ADDQ_F32(t[1 + offset], t[2 + offset]); + MS_FLOAT32X4 tmp2 = MS_ADDQ_F32(t[3 + offset], t[4 + offset]); + MS_FLOAT32X4 tmp3 = MS_ADDQ_F32(t[5 + offset], t[6 + offset]); + MS_FLOAT32X4 tmp4 = MS_SUBQ_F32(t[1 + offset], t[2 + offset]); + MS_FLOAT32X4 tmp5 = MS_SUBQ_F32(t[3 + offset], t[4 + offset]); + MS_FLOAT32X4 tmp6 = MS_SUBQ_F32(t[5 + offset], t[6 + offset]); + m[l] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(t[offset], tmp1), tmp2), tmp3), bias_ptr); + m[l + 3] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.5), tmp5), MS_MULQ_F32(tmp6, 1.5)), bias_ptr); + m[l + 6] = MS_ADDQ_F32( + MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp1, 0.25), tmp2), MS_MULQ_F32(tmp3, 2.25)), t[7 + offset]), + bias_ptr); + m[l] = MS_MAXQ_F32(zero, m[l]); + m[l + 3] = MS_MAXQ_F32(zero, m[l + 3]); + m[l + 6] = MS_MAXQ_F32(zero, m[l + 6]); } if (r_c == C4NUM && r_h == 3 && r_w == 3) { Store9Data; @@ -2270,45 +2289,46 @@ void OutputTransform8x3ReluUnit(const float *src_data, float *dst_data, const fl void OutputTransform8x3Relu6Unit(const float *src_data, 
float *dst_data, const float *bias_data, int src_step, int dst_step, int out_c, int r_w, int r_h, int r_c) { -#ifdef ENABLE_ARM - float32x4_t src[64]; - float32x4_t t[24]; - float32x4_t m[9]; - float32x4_t zero = vdupq_n_f32(0); - float32x4_t six = vdupq_n_f32(6); +#if defined(ENABLE_ARM) || defined(ENABLE_SSE) + MS_FLOAT32X4 src[64]; + MS_FLOAT32X4 t[24]; + MS_FLOAT32X4 m[9]; + MS_FLOAT32X4 zero = MS_MOVQ_F32(0); + MS_FLOAT32X4 six = MS_MOVQ_F32(6); Load64Data; - float32x4_t bias_ptr = vld1q_f32(bias_data); + MS_FLOAT32X4 bias_ptr = MS_LDQ_F32(bias_data); for (int l = 0; l < 8; ++l) { int offset = l * 8; - float32x4_t tmp1 = vaddq_f32(src[1 + offset], src[2 + offset]); - float32x4_t tmp2 = vaddq_f32(src[3 + offset], src[4 + offset]); - float32x4_t tmp3 = vaddq_f32(src[5 + offset], src[6 + offset]); - float32x4_t tmp4 = vsubq_f32(src[1 + offset], src[2 + offset]); - float32x4_t tmp5 = vsubq_f32(src[3 + offset], src[4 + offset]); - float32x4_t tmp6 = vsubq_f32(src[5 + offset], src[6 + offset]); - t[l] = vaddq_f32(vaddq_f32(vaddq_f32(src[offset], tmp1), tmp2), tmp3); - t[l + 8] = vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.5), tmp5), vmulq_n_f32(tmp6, 1.5)); + MS_FLOAT32X4 tmp1 = MS_ADDQ_F32(src[1 + offset], src[2 + offset]); + MS_FLOAT32X4 tmp2 = MS_ADDQ_F32(src[3 + offset], src[4 + offset]); + MS_FLOAT32X4 tmp3 = MS_ADDQ_F32(src[5 + offset], src[6 + offset]); + MS_FLOAT32X4 tmp4 = MS_SUBQ_F32(src[1 + offset], src[2 + offset]); + MS_FLOAT32X4 tmp5 = MS_SUBQ_F32(src[3 + offset], src[4 + offset]); + MS_FLOAT32X4 tmp6 = MS_SUBQ_F32(src[5 + offset], src[6 + offset]); + t[l] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(src[offset], tmp1), tmp2), tmp3); + t[l + 8] = MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.5), tmp5), MS_MULQ_F32(tmp6, 1.5)); t[l + 16] = - vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp1, 0.25), tmp2), vmulq_n_f32(tmp3, 2.25)), src[7 + offset]); + MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp1, 0.25), tmp2), MS_MULQ_F32(tmp3, 2.25)), src[7 + offset]); } for (int l = 0; l < 3; ++l) { int offset = l * 8; - float32x4_t tmp1 = vaddq_f32(t[1 + offset], t[2 + offset]); - float32x4_t tmp2 = vaddq_f32(t[3 + offset], t[4 + offset]); - float32x4_t tmp3 = vaddq_f32(t[5 + offset], t[6 + offset]); - float32x4_t tmp4 = vsubq_f32(t[1 + offset], t[2 + offset]); - float32x4_t tmp5 = vsubq_f32(t[3 + offset], t[4 + offset]); - float32x4_t tmp6 = vsubq_f32(t[5 + offset], t[6 + offset]); - m[l] = vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(t[offset], tmp1), tmp2), tmp3), bias_ptr); - m[l + 3] = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.5), tmp5), vmulq_n_f32(tmp6, 1.5)), bias_ptr); - m[l + 6] = vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp1, 0.25), tmp2), vmulq_n_f32(tmp3, 2.25)), t[7 + offset]), bias_ptr); - m[l] = vmaxq_f32(zero, m[l]); - m[l] = vminq_f32(six, m[l]); - m[l + 3] = vmaxq_f32(zero, m[l + 3]); - m[l + 3] = vminq_f32(six, m[l + 3]); - m[l + 6] = vmaxq_f32(zero, m[l + 6]); - m[l + 6] = vminq_f32(six, m[l + 6]); + MS_FLOAT32X4 tmp1 = MS_ADDQ_F32(t[1 + offset], t[2 + offset]); + MS_FLOAT32X4 tmp2 = MS_ADDQ_F32(t[3 + offset], t[4 + offset]); + MS_FLOAT32X4 tmp3 = MS_ADDQ_F32(t[5 + offset], t[6 + offset]); + MS_FLOAT32X4 tmp4 = MS_SUBQ_F32(t[1 + offset], t[2 + offset]); + MS_FLOAT32X4 tmp5 = MS_SUBQ_F32(t[3 + offset], t[4 + offset]); + MS_FLOAT32X4 tmp6 = MS_SUBQ_F32(t[5 + offset], t[6 + offset]); + m[l] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(t[offset], tmp1), tmp2), tmp3), bias_ptr); + m[l + 3] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.5), tmp5), 
MS_MULQ_F32(tmp6, 1.5)), bias_ptr); + m[l + 6] = MS_ADDQ_F32( + MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp1, 0.25), tmp2), MS_MULQ_F32(tmp3, 2.25)), t[7 + offset]), + bias_ptr); + m[l] = MS_MAXQ_F32(zero, m[l]); + m[l] = MS_MINQ_F32(six, m[l]); + m[l + 3] = MS_MAXQ_F32(zero, m[l + 3]); + m[l + 3] = MS_MINQ_F32(six, m[l + 3]); + m[l + 6] = MS_MAXQ_F32(zero, m[l + 6]); + m[l + 6] = MS_MINQ_F32(six, m[l + 6]); } if (r_c == C4NUM && r_h == 3 && r_w == 3) { Store9Data; @@ -2366,39 +2386,39 @@ void OutputTransform8x3Relu6Unit(const float *src_data, float *dst_data, const f void OutputTransform8x4Unit(const float *src_data, float *dst_data, const float *bias_data, int src_step, int dst_step, int out_c, int r_w, int r_h, int r_c) { -#ifdef ENABLE_ARM - float32x4_t src[64]; - float32x4_t t[32]; - float32x4_t m[16]; +#if defined(ENABLE_ARM) || defined(ENABLE_SSE) + MS_FLOAT32X4 src[64]; + MS_FLOAT32X4 t[32]; + MS_FLOAT32X4 m[16]; Load64Data; - float32x4_t bias_ptr = vld1q_f32(bias_data); + MS_FLOAT32X4 bias_ptr = MS_LDQ_F32(bias_data); for (int l = 0; l < 8; ++l) { int offset = l * 8; - float32x4_t tmp1 = vaddq_f32(src[1 + offset], src[2 + offset]); - float32x4_t tmp2 = vaddq_f32(src[3 + offset], src[4 + offset]); - float32x4_t tmp3 = vaddq_f32(src[5 + offset], src[6 + offset]); - float32x4_t tmp4 = vsubq_f32(src[1 + offset], src[2 + offset]); - float32x4_t tmp5 = vsubq_f32(src[3 + offset], src[4 + offset]); - float32x4_t tmp6 = vsubq_f32(src[5 + offset], src[6 + offset]); - t[l] = vaddq_f32(vaddq_f32(vaddq_f32(src[offset], tmp1), tmp2), tmp3); - t[l + 8] = vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.5), tmp5), vmulq_n_f32(tmp6, 1.5)); - t[l + 16] = vaddq_f32(vaddq_f32(vmulq_n_f32(tmp1, 0.25), tmp2), vmulq_n_f32(tmp3, 2.25)); + MS_FLOAT32X4 tmp1 = MS_ADDQ_F32(src[1 + offset], src[2 + offset]); + MS_FLOAT32X4 tmp2 = MS_ADDQ_F32(src[3 + offset], src[4 + offset]); + MS_FLOAT32X4 tmp3 = MS_ADDQ_F32(src[5 + offset], src[6 + offset]); + MS_FLOAT32X4 tmp4 = MS_SUBQ_F32(src[1 + offset], src[2 + offset]); + MS_FLOAT32X4 tmp5 = MS_SUBQ_F32(src[3 + offset], src[4 + offset]); + MS_FLOAT32X4 tmp6 = MS_SUBQ_F32(src[5 + offset], src[6 + offset]); + t[l] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(src[offset], tmp1), tmp2), tmp3); + t[l + 8] = MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.5), tmp5), MS_MULQ_F32(tmp6, 1.5)); + t[l + 16] = MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp1, 0.25), tmp2), MS_MULQ_F32(tmp3, 2.25)); t[l + 24] = - vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.125), tmp5), vmulq_n_f32(tmp6, 3.375)), src[7 + offset]); + MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.125), tmp5), MS_MULQ_F32(tmp6, 3.375)), src[7 + offset]); } for (int l = 0; l < 4; ++l) { int offset = l * 8; - float32x4_t tmp1 = vaddq_f32(t[1 + offset], t[2 + offset]); - float32x4_t tmp2 = vaddq_f32(t[3 + offset], t[4 + offset]); - float32x4_t tmp3 = vaddq_f32(t[5 + offset], t[6 + offset]); - float32x4_t tmp4 = vsubq_f32(t[1 + offset], t[2 + offset]); - float32x4_t tmp5 = vsubq_f32(t[3 + offset], t[4 + offset]); - float32x4_t tmp6 = vsubq_f32(t[5 + offset], t[6 + offset]); - m[l] = vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(t[offset], tmp1), tmp2), tmp3), bias_ptr); - m[l + 4] = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.5), tmp5), vmulq_n_f32(tmp6, 1.5)), bias_ptr); - m[l + 8] = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp1, 0.25), tmp2), vmulq_n_f32(tmp3, 2.25)), bias_ptr); - m[l + 12] = vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.125), tmp5), vmulq_n_f32(tmp6, 3.375)), t[7 + offset]), + 
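// Scalar reference for one column of the length-8 -> 4 transform vectorized
// here: output row k scales the +-0.5 / +-1 / +-1.5 pairs by node^k, which is
// where the constants 0.25/2.25 (k = 2) and 0.125/3.375 (k = 3) come from.
// Hypothetical helper mirroring the arithmetic above:
static inline void WinogradOutputCol8x4(const float s[8], float d[4]) {
  float sum1 = s[1] + s[2], sum2 = s[3] + s[4], sum3 = s[5] + s[6];
  float dif1 = s[1] - s[2], dif2 = s[3] - s[4], dif3 = s[5] - s[6];
  d[0] = s[0] + sum1 + sum2 + sum3;
  d[1] = 0.5f * dif1 + dif2 + 1.5f * dif3;
  d[2] = 0.25f * sum1 + sum2 + 2.25f * sum3;
  d[3] = 0.125f * dif1 + dif2 + 3.375f * dif3 + s[7];
}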
MS_FLOAT32X4 tmp1 = MS_ADDQ_F32(t[1 + offset], t[2 + offset]); + MS_FLOAT32X4 tmp2 = MS_ADDQ_F32(t[3 + offset], t[4 + offset]); + MS_FLOAT32X4 tmp3 = MS_ADDQ_F32(t[5 + offset], t[6 + offset]); + MS_FLOAT32X4 tmp4 = MS_SUBQ_F32(t[1 + offset], t[2 + offset]); + MS_FLOAT32X4 tmp5 = MS_SUBQ_F32(t[3 + offset], t[4 + offset]); + MS_FLOAT32X4 tmp6 = MS_SUBQ_F32(t[5 + offset], t[6 + offset]); + m[l] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(t[offset], tmp1), tmp2), tmp3), bias_ptr); + m[l + 4] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.5), tmp5), MS_MULQ_F32(tmp6, 1.5)), bias_ptr); + m[l + 8] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp1, 0.25), tmp2), MS_MULQ_F32(tmp3, 2.25)), bias_ptr); + m[l + 12] = MS_ADDQ_F32( + MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.125), tmp5), MS_MULQ_F32(tmp6, 3.375)), t[7 + offset]), bias_ptr); } if (r_c == C4NUM && r_h == 4 && r_w == 4) { @@ -2458,45 +2478,45 @@ void OutputTransform8x4Unit(const float *src_data, float *dst_data, const float void OutputTransform8x4ReluUnit(const float *src_data, float *dst_data, const float *bias_data, int src_step, int dst_step, int out_c, int r_w, int r_h, int r_c) { -#ifdef ENABLE_ARM - float32x4_t src[64]; - float32x4_t t[32]; - float32x4_t m[16]; - float32x4_t zero = vdupq_n_f32(0); +#if defined(ENABLE_ARM) || defined(ENABLE_SSE) + MS_FLOAT32X4 src[64]; + MS_FLOAT32X4 t[32]; + MS_FLOAT32X4 m[16]; + MS_FLOAT32X4 zero = MS_MOVQ_F32(0); Load64Data; - float32x4_t bias_ptr = vld1q_f32(bias_data); + MS_FLOAT32X4 bias_ptr = MS_LDQ_F32(bias_data); for (int l = 0; l < 8; ++l) { int offset = l * 8; - float32x4_t tmp1 = vaddq_f32(src[1 + offset], src[2 + offset]); - float32x4_t tmp2 = vaddq_f32(src[3 + offset], src[4 + offset]); - float32x4_t tmp3 = vaddq_f32(src[5 + offset], src[6 + offset]); - float32x4_t tmp4 = vsubq_f32(src[1 + offset], src[2 + offset]); - float32x4_t tmp5 = vsubq_f32(src[3 + offset], src[4 + offset]); - float32x4_t tmp6 = vsubq_f32(src[5 + offset], src[6 + offset]); - t[l] = vaddq_f32(vaddq_f32(vaddq_f32(src[offset], tmp1), tmp2), tmp3); - t[l + 8] = vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.5), tmp5), vmulq_n_f32(tmp6, 1.5)); - t[l + 16] = vaddq_f32(vaddq_f32(vmulq_n_f32(tmp1, 0.25), tmp2), vmulq_n_f32(tmp3, 2.25)); + MS_FLOAT32X4 tmp1 = MS_ADDQ_F32(src[1 + offset], src[2 + offset]); + MS_FLOAT32X4 tmp2 = MS_ADDQ_F32(src[3 + offset], src[4 + offset]); + MS_FLOAT32X4 tmp3 = MS_ADDQ_F32(src[5 + offset], src[6 + offset]); + MS_FLOAT32X4 tmp4 = MS_SUBQ_F32(src[1 + offset], src[2 + offset]); + MS_FLOAT32X4 tmp5 = MS_SUBQ_F32(src[3 + offset], src[4 + offset]); + MS_FLOAT32X4 tmp6 = MS_SUBQ_F32(src[5 + offset], src[6 + offset]); + t[l] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(src[offset], tmp1), tmp2), tmp3); + t[l + 8] = MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.5), tmp5), MS_MULQ_F32(tmp6, 1.5)); + t[l + 16] = MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp1, 0.25), tmp2), MS_MULQ_F32(tmp3, 2.25)); t[l + 24] = - vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.125), tmp5), vmulq_n_f32(tmp6, 3.375)), src[7 + offset]); + MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.125), tmp5), MS_MULQ_F32(tmp6, 3.375)), src[7 + offset]); } for (int l = 0; l < 4; ++l) { int offset = l * 8; - float32x4_t tmp1 = vaddq_f32(t[1 + offset], t[2 + offset]); - float32x4_t tmp2 = vaddq_f32(t[3 + offset], t[4 + offset]); - float32x4_t tmp3 = vaddq_f32(t[5 + offset], t[6 + offset]); - float32x4_t tmp4 = vsubq_f32(t[1 + offset], t[2 + offset]); - float32x4_t tmp5 = vsubq_f32(t[3 + offset], t[4 + offset]); - 
float32x4_t tmp6 = vsubq_f32(t[5 + offset], t[6 + offset]); - m[l] = vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(t[offset], tmp1), tmp2), tmp3), bias_ptr); - m[l + 4] = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.5), tmp5), vmulq_n_f32(tmp6, 1.5)), bias_ptr); - m[l + 8] = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp1, 0.25), tmp2), vmulq_n_f32(tmp3, 2.25)), bias_ptr); - m[l + 12] = vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.125), tmp5), vmulq_n_f32(tmp6, 3.375)), t[7 + offset]), + MS_FLOAT32X4 tmp1 = MS_ADDQ_F32(t[1 + offset], t[2 + offset]); + MS_FLOAT32X4 tmp2 = MS_ADDQ_F32(t[3 + offset], t[4 + offset]); + MS_FLOAT32X4 tmp3 = MS_ADDQ_F32(t[5 + offset], t[6 + offset]); + MS_FLOAT32X4 tmp4 = MS_SUBQ_F32(t[1 + offset], t[2 + offset]); + MS_FLOAT32X4 tmp5 = MS_SUBQ_F32(t[3 + offset], t[4 + offset]); + MS_FLOAT32X4 tmp6 = MS_SUBQ_F32(t[5 + offset], t[6 + offset]); + m[l] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(t[offset], tmp1), tmp2), tmp3), bias_ptr); + m[l + 4] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.5), tmp5), MS_MULQ_F32(tmp6, 1.5)), bias_ptr); + m[l + 8] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp1, 0.25), tmp2), MS_MULQ_F32(tmp3, 2.25)), bias_ptr); + m[l + 12] = MS_ADDQ_F32( + MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.125), tmp5), MS_MULQ_F32(tmp6, 3.375)), t[7 + offset]), bias_ptr); - m[l] = vmaxq_f32(zero, m[l]); - m[l + 4] = vmaxq_f32(zero, m[l + 4]); - m[l + 8] = vmaxq_f32(zero, m[l + 8]); - m[l + 12] = vmaxq_f32(zero, m[l + 12]); + m[l] = MS_MAXQ_F32(zero, m[l]); + m[l + 4] = MS_MAXQ_F32(zero, m[l + 4]); + m[l + 8] = MS_MAXQ_F32(zero, m[l + 8]); + m[l + 12] = MS_MAXQ_F32(zero, m[l + 12]); } if (r_c == C4NUM && r_h == 4 && r_w == 4) { Store16Data; @@ -2557,50 +2577,50 @@ void OutputTransform8x4ReluUnit(const float *src_data, float *dst_data, const fl void OutputTransform8x4Relu6Unit(const float *src_data, float *dst_data, const float *bias_data, int src_step, int dst_step, int out_c, int r_w, int r_h, int r_c) { -#ifdef ENABLE_ARM - float32x4_t src[64]; - float32x4_t t[32]; - float32x4_t m[16]; - float32x4_t zero = vdupq_n_f32(0); - float32x4_t six = vdupq_n_f32(6); +#if defined(ENABLE_ARM) || defined(ENABLE_SSE) + MS_FLOAT32X4 src[64]; + MS_FLOAT32X4 t[32]; + MS_FLOAT32X4 m[16]; + MS_FLOAT32X4 zero = MS_MOVQ_F32(0); + MS_FLOAT32X4 six = MS_MOVQ_F32(6); Load64Data; - float32x4_t bias_ptr = vld1q_f32(bias_data); + MS_FLOAT32X4 bias_ptr = MS_LDQ_F32(bias_data); for (int l = 0; l < 8; ++l) { int offset = l * 8; - float32x4_t tmp1 = vaddq_f32(src[1 + offset], src[2 + offset]); - float32x4_t tmp2 = vaddq_f32(src[3 + offset], src[4 + offset]); - float32x4_t tmp3 = vaddq_f32(src[5 + offset], src[6 + offset]); - float32x4_t tmp4 = vsubq_f32(src[1 + offset], src[2 + offset]); - float32x4_t tmp5 = vsubq_f32(src[3 + offset], src[4 + offset]); - float32x4_t tmp6 = vsubq_f32(src[5 + offset], src[6 + offset]); - t[l] = vaddq_f32(vaddq_f32(vaddq_f32(src[offset], tmp1), tmp2), tmp3); - t[l + 8] = vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.5), tmp5), vmulq_n_f32(tmp6, 1.5)); - t[l + 16] = vaddq_f32(vaddq_f32(vmulq_n_f32(tmp1, 0.25), tmp2), vmulq_n_f32(tmp3, 2.25)); + MS_FLOAT32X4 tmp1 = MS_ADDQ_F32(src[1 + offset], src[2 + offset]); + MS_FLOAT32X4 tmp2 = MS_ADDQ_F32(src[3 + offset], src[4 + offset]); + MS_FLOAT32X4 tmp3 = MS_ADDQ_F32(src[5 + offset], src[6 + offset]); + MS_FLOAT32X4 tmp4 = MS_SUBQ_F32(src[1 + offset], src[2 + offset]); + MS_FLOAT32X4 tmp5 = MS_SUBQ_F32(src[3 + offset], src[4 + offset]); + MS_FLOAT32X4 tmp6 = 
MS_SUBQ_F32(src[5 + offset], src[6 + offset]); + t[l] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(src[offset], tmp1), tmp2), tmp3); + t[l + 8] = MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.5), tmp5), MS_MULQ_F32(tmp6, 1.5)); + t[l + 16] = MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp1, 0.25), tmp2), MS_MULQ_F32(tmp3, 2.25)); t[l + 24] = - vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.125), tmp5), vmulq_n_f32(tmp6, 3.375)), src[7 + offset]); + MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.125), tmp5), MS_MULQ_F32(tmp6, 3.375)), src[7 + offset]); } for (int l = 0; l < 4; ++l) { int offset = l * 8; - float32x4_t tmp1 = vaddq_f32(t[1 + offset], t[2 + offset]); - float32x4_t tmp2 = vaddq_f32(t[3 + offset], t[4 + offset]); - float32x4_t tmp3 = vaddq_f32(t[5 + offset], t[6 + offset]); - float32x4_t tmp4 = vsubq_f32(t[1 + offset], t[2 + offset]); - float32x4_t tmp5 = vsubq_f32(t[3 + offset], t[4 + offset]); - float32x4_t tmp6 = vsubq_f32(t[5 + offset], t[6 + offset]); - m[l] = vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(t[offset], tmp1), tmp2), tmp3), bias_ptr); - m[l + 4] = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.5), tmp5), vmulq_n_f32(tmp6, 1.5)), bias_ptr); - m[l + 8] = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp1, 0.25), tmp2), vmulq_n_f32(tmp3, 2.25)), bias_ptr); - m[l + 12] = vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.125), tmp5), vmulq_n_f32(tmp6, 3.375)), t[7 + offset]), + MS_FLOAT32X4 tmp1 = MS_ADDQ_F32(t[1 + offset], t[2 + offset]); + MS_FLOAT32X4 tmp2 = MS_ADDQ_F32(t[3 + offset], t[4 + offset]); + MS_FLOAT32X4 tmp3 = MS_ADDQ_F32(t[5 + offset], t[6 + offset]); + MS_FLOAT32X4 tmp4 = MS_SUBQ_F32(t[1 + offset], t[2 + offset]); + MS_FLOAT32X4 tmp5 = MS_SUBQ_F32(t[3 + offset], t[4 + offset]); + MS_FLOAT32X4 tmp6 = MS_SUBQ_F32(t[5 + offset], t[6 + offset]); + m[l] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(t[offset], tmp1), tmp2), tmp3), bias_ptr); + m[l + 4] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.5), tmp5), MS_MULQ_F32(tmp6, 1.5)), bias_ptr); + m[l + 8] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp1, 0.25), tmp2), MS_MULQ_F32(tmp3, 2.25)), bias_ptr); + m[l + 12] = MS_ADDQ_F32( + MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.125), tmp5), MS_MULQ_F32(tmp6, 3.375)), t[7 + offset]), bias_ptr); - m[l] = vmaxq_f32(zero, m[l]); - m[l] = vminq_f32(six, m[l]); - m[l + 4] = vmaxq_f32(zero, m[l + 4]); - m[l + 4] = vminq_f32(six, m[l + 4]); - m[l + 8] = vmaxq_f32(zero, m[l + 8]); - m[l + 8] = vminq_f32(six, m[l + 8]); - m[l + 12] = vmaxq_f32(zero, m[l + 12]); - m[l + 12] = vminq_f32(six, m[l + 12]); + m[l] = MS_MAXQ_F32(zero, m[l]); + m[l] = MS_MINQ_F32(six, m[l]); + m[l + 4] = MS_MAXQ_F32(zero, m[l + 4]); + m[l + 4] = MS_MINQ_F32(six, m[l + 4]); + m[l + 8] = MS_MAXQ_F32(zero, m[l + 8]); + m[l + 8] = MS_MINQ_F32(six, m[l + 8]); + m[l + 12] = MS_MAXQ_F32(zero, m[l + 12]); + m[l + 12] = MS_MINQ_F32(six, m[l + 12]); } if (r_c == C4NUM && r_h == 4 && r_w == 4) { Store16Data; @@ -2662,41 +2682,42 @@ void OutputTransform8x4Relu6Unit(const float *src_data, float *dst_data, const f void OutputTransform8x5Unit(const float *src_data, float *dst_data, const float *bias_data, int src_step, int dst_step, int out_c, int r_w, int r_h, int r_c) { -#ifdef ENABLE_ARM - float32x4_t src[64]; - float32x4_t t[40]; - float32x4_t m[25]; +#if defined(ENABLE_ARM) || defined(ENABLE_SSE) + MS_FLOAT32X4 src[64]; + MS_FLOAT32X4 t[40]; + MS_FLOAT32X4 m[25]; Load64Data; - float32x4_t bias_ptr = vld1q_f32(bias_data); + MS_FLOAT32X4 bias_ptr = 
MS_LDQ_F32(bias_data); for (int l = 0; l < 8; ++l) { int offset = l * 8; - float32x4_t tmp1 = vaddq_f32(src[1 + offset], src[2 + offset]); - float32x4_t tmp2 = vaddq_f32(src[3 + offset], src[4 + offset]); - float32x4_t tmp3 = vaddq_f32(src[5 + offset], src[6 + offset]); - float32x4_t tmp4 = vsubq_f32(src[1 + offset], src[2 + offset]); - float32x4_t tmp5 = vsubq_f32(src[3 + offset], src[4 + offset]); - float32x4_t tmp6 = vsubq_f32(src[5 + offset], src[6 + offset]); - t[l] = vaddq_f32(vaddq_f32(vaddq_f32(src[offset], tmp1), tmp2), tmp3); - t[l + 8] = vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.5), tmp5), vmulq_n_f32(tmp6, 1.5)); - t[l + 16] = vaddq_f32(vaddq_f32(vmulq_n_f32(tmp1, 0.25), tmp2), vmulq_n_f32(tmp3, 2.25)); - t[l + 24] = vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.125), tmp5), vmulq_n_f32(tmp6, 3.375)); - t[l + 32] = - vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp1, 0.0625), tmp2), vmulq_n_f32(tmp3, 5.0625)), src[7 + offset]); + MS_FLOAT32X4 tmp1 = MS_ADDQ_F32(src[1 + offset], src[2 + offset]); + MS_FLOAT32X4 tmp2 = MS_ADDQ_F32(src[3 + offset], src[4 + offset]); + MS_FLOAT32X4 tmp3 = MS_ADDQ_F32(src[5 + offset], src[6 + offset]); + MS_FLOAT32X4 tmp4 = MS_SUBQ_F32(src[1 + offset], src[2 + offset]); + MS_FLOAT32X4 tmp5 = MS_SUBQ_F32(src[3 + offset], src[4 + offset]); + MS_FLOAT32X4 tmp6 = MS_SUBQ_F32(src[5 + offset], src[6 + offset]); + t[l] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(src[offset], tmp1), tmp2), tmp3); + t[l + 8] = MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.5), tmp5), MS_MULQ_F32(tmp6, 1.5)); + t[l + 16] = MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp1, 0.25), tmp2), MS_MULQ_F32(tmp3, 2.25)); + t[l + 24] = MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.125), tmp5), MS_MULQ_F32(tmp6, 3.375)); + t[l + 32] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp1, 0.0625), tmp2), MS_MULQ_F32(tmp3, 5.0625)), + src[7 + offset]); } for (int l = 0; l < 5; ++l) { int offset = l * 8; - float32x4_t tmp1 = vaddq_f32(t[1 + offset], t[2 + offset]); - float32x4_t tmp2 = vaddq_f32(t[3 + offset], t[4 + offset]); - float32x4_t tmp3 = vaddq_f32(t[5 + offset], t[6 + offset]); - float32x4_t tmp4 = vsubq_f32(t[1 + offset], t[2 + offset]); - float32x4_t tmp5 = vsubq_f32(t[3 + offset], t[4 + offset]); - float32x4_t tmp6 = vsubq_f32(t[5 + offset], t[6 + offset]); - m[l] = vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(t[offset], tmp1), tmp2), tmp3), bias_ptr); - m[l + 5] = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.5), tmp5), vmulq_n_f32(tmp6, 1.5)), bias_ptr); - m[l + 10] = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp1, 0.25), tmp2), vmulq_n_f32(tmp3, 2.25)), bias_ptr); - m[l + 15] = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.125), tmp5), vmulq_n_f32(tmp6, 3.375)), bias_ptr); - m[l + 20] = vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp1, 0.0625), tmp2), vmulq_n_f32(tmp3, 5.0625)), t[7 + offset]), + MS_FLOAT32X4 tmp1 = MS_ADDQ_F32(t[1 + offset], t[2 + offset]); + MS_FLOAT32X4 tmp2 = MS_ADDQ_F32(t[3 + offset], t[4 + offset]); + MS_FLOAT32X4 tmp3 = MS_ADDQ_F32(t[5 + offset], t[6 + offset]); + MS_FLOAT32X4 tmp4 = MS_SUBQ_F32(t[1 + offset], t[2 + offset]); + MS_FLOAT32X4 tmp5 = MS_SUBQ_F32(t[3 + offset], t[4 + offset]); + MS_FLOAT32X4 tmp6 = MS_SUBQ_F32(t[5 + offset], t[6 + offset]); + m[l] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(t[offset], tmp1), tmp2), tmp3), bias_ptr); + m[l + 5] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.5), tmp5), MS_MULQ_F32(tmp6, 1.5)), bias_ptr); + m[l + 10] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp1, 0.25), tmp2), MS_MULQ_F32(tmp3, 
2.25)), bias_ptr); + m[l + 15] = + MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.125), tmp5), MS_MULQ_F32(tmp6, 3.375)), bias_ptr); + m[l + 20] = MS_ADDQ_F32( + MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp1, 0.0625), tmp2), MS_MULQ_F32(tmp3, 5.0625)), t[7 + offset]), bias_ptr); } if (r_c == C4NUM && r_h == 5 && r_w == 5) { @@ -2760,48 +2781,49 @@ void OutputTransform8x5Unit(const float *src_data, float *dst_data, const float void OutputTransform8x5ReluUnit(const float *src_data, float *dst_data, const float *bias_data, int src_step, int dst_step, int out_c, int r_w, int r_h, int r_c) { -#ifdef ENABLE_ARM - float32x4_t src[64]; - float32x4_t t[40]; - float32x4_t m[25]; - float32x4_t zero = vdupq_n_f32(0); +#if defined(ENABLE_ARM) || defined(ENABLE_SSE) + MS_FLOAT32X4 src[64]; + MS_FLOAT32X4 t[40]; + MS_FLOAT32X4 m[25]; + MS_FLOAT32X4 zero = MS_MOVQ_F32(0); Load64Data; - float32x4_t bias_ptr = vld1q_f32(bias_data); + MS_FLOAT32X4 bias_ptr = MS_LDQ_F32(bias_data); for (int l = 0; l < 8; ++l) { int offset = l * 8; - float32x4_t tmp1 = vaddq_f32(src[1 + offset], src[2 + offset]); - float32x4_t tmp2 = vaddq_f32(src[3 + offset], src[4 + offset]); - float32x4_t tmp3 = vaddq_f32(src[5 + offset], src[6 + offset]); - float32x4_t tmp4 = vsubq_f32(src[1 + offset], src[2 + offset]); - float32x4_t tmp5 = vsubq_f32(src[3 + offset], src[4 + offset]); - float32x4_t tmp6 = vsubq_f32(src[5 + offset], src[6 + offset]); - t[l] = vaddq_f32(vaddq_f32(vaddq_f32(src[offset], tmp1), tmp2), tmp3); - t[l + 8] = vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.5), tmp5), vmulq_n_f32(tmp6, 1.5)); - t[l + 16] = vaddq_f32(vaddq_f32(vmulq_n_f32(tmp1, 0.25), tmp2), vmulq_n_f32(tmp3, 2.25)); - t[l + 24] = vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.125), tmp5), vmulq_n_f32(tmp6, 3.375)); - t[l + 32] = - vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp1, 0.0625), tmp2), vmulq_n_f32(tmp3, 5.0625)), src[7 + offset]); + MS_FLOAT32X4 tmp1 = MS_ADDQ_F32(src[1 + offset], src[2 + offset]); + MS_FLOAT32X4 tmp2 = MS_ADDQ_F32(src[3 + offset], src[4 + offset]); + MS_FLOAT32X4 tmp3 = MS_ADDQ_F32(src[5 + offset], src[6 + offset]); + MS_FLOAT32X4 tmp4 = MS_SUBQ_F32(src[1 + offset], src[2 + offset]); + MS_FLOAT32X4 tmp5 = MS_SUBQ_F32(src[3 + offset], src[4 + offset]); + MS_FLOAT32X4 tmp6 = MS_SUBQ_F32(src[5 + offset], src[6 + offset]); + t[l] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(src[offset], tmp1), tmp2), tmp3); + t[l + 8] = MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.5), tmp5), MS_MULQ_F32(tmp6, 1.5)); + t[l + 16] = MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp1, 0.25), tmp2), MS_MULQ_F32(tmp3, 2.25)); + t[l + 24] = MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.125), tmp5), MS_MULQ_F32(tmp6, 3.375)); + t[l + 32] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp1, 0.0625), tmp2), MS_MULQ_F32(tmp3, 5.0625)), + src[7 + offset]); } for (int l = 0; l < 5; ++l) { int offset = l * 8; - float32x4_t tmp1 = vaddq_f32(t[1 + offset], t[2 + offset]); - float32x4_t tmp2 = vaddq_f32(t[3 + offset], t[4 + offset]); - float32x4_t tmp3 = vaddq_f32(t[5 + offset], t[6 + offset]); - float32x4_t tmp4 = vsubq_f32(t[1 + offset], t[2 + offset]); - float32x4_t tmp5 = vsubq_f32(t[3 + offset], t[4 + offset]); - float32x4_t tmp6 = vsubq_f32(t[5 + offset], t[6 + offset]); - m[l] = vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(t[offset], tmp1), tmp2), tmp3), bias_ptr); - m[l + 5] = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.5), tmp5), vmulq_n_f32(tmp6, 1.5)), bias_ptr); - m[l + 10] = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp1, 0.25), tmp2), vmulq_n_f32(tmp3, 2.25)), 
bias_ptr); - m[l + 15] = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.125), tmp5), vmulq_n_f32(tmp6, 3.375)), bias_ptr); - m[l + 20] = vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp1, 0.0625), tmp2), vmulq_n_f32(tmp3, 5.0625)), t[7 + offset]), + MS_FLOAT32X4 tmp1 = MS_ADDQ_F32(t[1 + offset], t[2 + offset]); + MS_FLOAT32X4 tmp2 = MS_ADDQ_F32(t[3 + offset], t[4 + offset]); + MS_FLOAT32X4 tmp3 = MS_ADDQ_F32(t[5 + offset], t[6 + offset]); + MS_FLOAT32X4 tmp4 = MS_SUBQ_F32(t[1 + offset], t[2 + offset]); + MS_FLOAT32X4 tmp5 = MS_SUBQ_F32(t[3 + offset], t[4 + offset]); + MS_FLOAT32X4 tmp6 = MS_SUBQ_F32(t[5 + offset], t[6 + offset]); + m[l] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(t[offset], tmp1), tmp2), tmp3), bias_ptr); + m[l + 5] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.5), tmp5), MS_MULQ_F32(tmp6, 1.5)), bias_ptr); + m[l + 10] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp1, 0.25), tmp2), MS_MULQ_F32(tmp3, 2.25)), bias_ptr); + m[l + 15] = + MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.125), tmp5), MS_MULQ_F32(tmp6, 3.375)), bias_ptr); + m[l + 20] = MS_ADDQ_F32( + MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp1, 0.0625), tmp2), MS_MULQ_F32(tmp3, 5.0625)), t[7 + offset]), bias_ptr); - m[l] = vmaxq_f32(zero, m[l]); - m[l + 5] = vmaxq_f32(zero, m[l + 5]); - m[l + 10] = vmaxq_f32(zero, m[l + 10]); - m[l + 15] = vmaxq_f32(zero, m[l + 15]); - m[l + 20] = vmaxq_f32(zero, m[l + 20]); + m[l] = MS_MAXQ_F32(zero, m[l]); + m[l + 5] = MS_MAXQ_F32(zero, m[l + 5]); + m[l + 10] = MS_MAXQ_F32(zero, m[l + 10]); + m[l + 15] = MS_MAXQ_F32(zero, m[l + 15]); + m[l + 20] = MS_MAXQ_F32(zero, m[l + 20]); } if (r_c == C4NUM && r_h == 5 && r_w == 5) { Store25Data; @@ -2866,54 +2888,55 @@ void OutputTransform8x5ReluUnit(const float *src_data, float *dst_data, const fl void OutputTransform8x5Relu6Unit(const float *src_data, float *dst_data, const float *bias_data, int src_step, int dst_step, int out_c, int r_w, int r_h, int r_c) { -#ifdef ENABLE_ARM - float32x4_t src[64]; - float32x4_t t[40]; - float32x4_t m[25]; - float32x4_t zero = vdupq_n_f32(0); - float32x4_t six = vdupq_n_f32(6); +#if defined(ENABLE_ARM) || defined(ENABLE_SSE) + MS_FLOAT32X4 src[64]; + MS_FLOAT32X4 t[40]; + MS_FLOAT32X4 m[25]; + MS_FLOAT32X4 zero = MS_MOVQ_F32(0); + MS_FLOAT32X4 six = MS_MOVQ_F32(6); Load64Data; - float32x4_t bias_ptr = vld1q_f32(bias_data); + MS_FLOAT32X4 bias_ptr = MS_LDQ_F32(bias_data); for (int l = 0; l < 8; ++l) { int offset = l * 8; - float32x4_t tmp1 = vaddq_f32(src[1 + offset], src[2 + offset]); - float32x4_t tmp2 = vaddq_f32(src[3 + offset], src[4 + offset]); - float32x4_t tmp3 = vaddq_f32(src[5 + offset], src[6 + offset]); - float32x4_t tmp4 = vsubq_f32(src[1 + offset], src[2 + offset]); - float32x4_t tmp5 = vsubq_f32(src[3 + offset], src[4 + offset]); - float32x4_t tmp6 = vsubq_f32(src[5 + offset], src[6 + offset]); - t[l] = vaddq_f32(vaddq_f32(vaddq_f32(src[offset], tmp1), tmp2), tmp3); - t[l + 8] = vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.5), tmp5), vmulq_n_f32(tmp6, 1.5)); - t[l + 16] = vaddq_f32(vaddq_f32(vmulq_n_f32(tmp1, 0.25), tmp2), vmulq_n_f32(tmp3, 2.25)); - t[l + 24] = vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.125), tmp5), vmulq_n_f32(tmp6, 3.375)); - t[l + 32] = - vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp1, 0.0625), tmp2), vmulq_n_f32(tmp3, 5.0625)), src[7 + offset]); + MS_FLOAT32X4 tmp1 = MS_ADDQ_F32(src[1 + offset], src[2 + offset]); + MS_FLOAT32X4 tmp2 = MS_ADDQ_F32(src[3 + offset], src[4 + offset]); + MS_FLOAT32X4 tmp3 = MS_ADDQ_F32(src[5 + 
offset], src[6 + offset]); + MS_FLOAT32X4 tmp4 = MS_SUBQ_F32(src[1 + offset], src[2 + offset]); + MS_FLOAT32X4 tmp5 = MS_SUBQ_F32(src[3 + offset], src[4 + offset]); + MS_FLOAT32X4 tmp6 = MS_SUBQ_F32(src[5 + offset], src[6 + offset]); + t[l] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(src[offset], tmp1), tmp2), tmp3); + t[l + 8] = MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.5), tmp5), MS_MULQ_F32(tmp6, 1.5)); + t[l + 16] = MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp1, 0.25), tmp2), MS_MULQ_F32(tmp3, 2.25)); + t[l + 24] = MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.125), tmp5), MS_MULQ_F32(tmp6, 3.375)); + t[l + 32] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp1, 0.0625), tmp2), MS_MULQ_F32(tmp3, 5.0625)), + src[7 + offset]); } for (int l = 0; l < 5; ++l) { int offset = l * 8; - float32x4_t tmp1 = vaddq_f32(t[1 + offset], t[2 + offset]); - float32x4_t tmp2 = vaddq_f32(t[3 + offset], t[4 + offset]); - float32x4_t tmp3 = vaddq_f32(t[5 + offset], t[6 + offset]); - float32x4_t tmp4 = vsubq_f32(t[1 + offset], t[2 + offset]); - float32x4_t tmp5 = vsubq_f32(t[3 + offset], t[4 + offset]); - float32x4_t tmp6 = vsubq_f32(t[5 + offset], t[6 + offset]); - m[l] = vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(t[offset], tmp1), tmp2), tmp3), bias_ptr); - m[l + 5] = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.5), tmp5), vmulq_n_f32(tmp6, 1.5)), bias_ptr); - m[l + 10] = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp1, 0.25), tmp2), vmulq_n_f32(tmp3, 2.25)), bias_ptr); - m[l + 15] = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.125), tmp5), vmulq_n_f32(tmp6, 3.375)), bias_ptr); - m[l + 20] = vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp1, 0.0625), tmp2), vmulq_n_f32(tmp3, 5.0625)), t[7 + offset]), + MS_FLOAT32X4 tmp1 = MS_ADDQ_F32(t[1 + offset], t[2 + offset]); + MS_FLOAT32X4 tmp2 = MS_ADDQ_F32(t[3 + offset], t[4 + offset]); + MS_FLOAT32X4 tmp3 = MS_ADDQ_F32(t[5 + offset], t[6 + offset]); + MS_FLOAT32X4 tmp4 = MS_SUBQ_F32(t[1 + offset], t[2 + offset]); + MS_FLOAT32X4 tmp5 = MS_SUBQ_F32(t[3 + offset], t[4 + offset]); + MS_FLOAT32X4 tmp6 = MS_SUBQ_F32(t[5 + offset], t[6 + offset]); + m[l] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(t[offset], tmp1), tmp2), tmp3), bias_ptr); + m[l + 5] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.5), tmp5), MS_MULQ_F32(tmp6, 1.5)), bias_ptr); + m[l + 10] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp1, 0.25), tmp2), MS_MULQ_F32(tmp3, 2.25)), bias_ptr); + m[l + 15] = + MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.125), tmp5), MS_MULQ_F32(tmp6, 3.375)), bias_ptr); + m[l + 20] = MS_ADDQ_F32( + MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp1, 0.0625), tmp2), MS_MULQ_F32(tmp3, 5.0625)), t[7 + offset]), bias_ptr); - m[l] = vmaxq_f32(zero, m[l]); - m[l] = vminq_f32(six, m[l]); - m[l + 5] = vmaxq_f32(zero, m[l + 5]); - m[l + 5] = vminq_f32(six, m[l + 5]); - m[l + 10] = vmaxq_f32(zero, m[l + 10]); - m[l + 10] = vminq_f32(six, m[l + 10]); - m[l + 15] = vmaxq_f32(zero, m[l + 15]); - m[l + 15] = vminq_f32(six, m[l + 15]); - m[l + 20] = vmaxq_f32(zero, m[l + 20]); - m[l + 20] = vminq_f32(six, m[l + 20]); + m[l] = MS_MAXQ_F32(zero, m[l]); + m[l] = MS_MINQ_F32(six, m[l]); + m[l + 5] = MS_MAXQ_F32(zero, m[l + 5]); + m[l + 5] = MS_MINQ_F32(six, m[l + 5]); + m[l + 10] = MS_MAXQ_F32(zero, m[l + 10]); + m[l + 10] = MS_MINQ_F32(six, m[l + 10]); + m[l + 15] = MS_MAXQ_F32(zero, m[l + 15]); + m[l + 15] = MS_MINQ_F32(six, m[l + 15]); + m[l + 20] = MS_MAXQ_F32(zero, m[l + 20]); + m[l + 20] = MS_MINQ_F32(six, m[l + 20]); } if (r_c == C4NUM && r_h == 
5 && r_w == 5) { Store25Data; @@ -2979,55 +3002,58 @@ void OutputTransform8x5Relu6Unit(const float *src_data, float *dst_data, const f void OutputTransform8x6Unit(const float *src_data, float *dst_data, const float *bias_data, int src_step, int dst_step, int out_c, int r_w, int r_h, int r_c) { -#ifdef ENABLE_ARM - float32x4_t src[64]; - float32x4_t t[48]; - float32x4_t m[36]; +#if defined(ENABLE_ARM) || defined(ENABLE_SSE) + MS_FLOAT32X4 src[64]; + MS_FLOAT32X4 t[48]; + MS_FLOAT32X4 m[36]; Load64Data; - float32x4_t bias_ptr = vld1q_f32(bias_data); + MS_FLOAT32X4 bias_ptr = MS_LDQ_F32(bias_data); for (int l = 0; l < 8; ++l) { int offset = l * 8; - float32x4_t tmp1 = vaddq_f32(src[1 + offset], src[2 + offset]); - float32x4_t tmp2 = vaddq_f32(src[3 + offset], src[4 + offset]); - float32x4_t tmp3 = vaddq_f32(src[5 + offset], src[6 + offset]); - float32x4_t tmp4 = vsubq_f32(src[1 + offset], src[2 + offset]); - float32x4_t tmp5 = vsubq_f32(src[3 + offset], src[4 + offset]); - float32x4_t tmp6 = vsubq_f32(src[5 + offset], src[6 + offset]); - t[l] = vaddq_f32(vaddq_f32(vaddq_f32(src[offset], tmp1), tmp2), tmp3); - t[l + 8] = vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.5), tmp5), vmulq_n_f32(tmp6, 1.5)); - t[l + 16] = vaddq_f32(vaddq_f32(vmulq_n_f32(tmp1, 0.25), tmp2), vmulq_n_f32(tmp3, 2.25)); - t[l + 24] = vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.125), tmp5), vmulq_n_f32(tmp6, 3.375)); - t[l + 32] = vaddq_f32(vaddq_f32(vmulq_n_f32(tmp1, 0.0625), tmp2), vmulq_n_f32(tmp3, 5.0625)); - t[l + 40] = - vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.03125), tmp5), vmulq_n_f32(tmp6, 7.59375)), src[7 + offset]); + MS_FLOAT32X4 tmp1 = MS_ADDQ_F32(src[1 + offset], src[2 + offset]); + MS_FLOAT32X4 tmp2 = MS_ADDQ_F32(src[3 + offset], src[4 + offset]); + MS_FLOAT32X4 tmp3 = MS_ADDQ_F32(src[5 + offset], src[6 + offset]); + MS_FLOAT32X4 tmp4 = MS_SUBQ_F32(src[1 + offset], src[2 + offset]); + MS_FLOAT32X4 tmp5 = MS_SUBQ_F32(src[3 + offset], src[4 + offset]); + MS_FLOAT32X4 tmp6 = MS_SUBQ_F32(src[5 + offset], src[6 + offset]); + t[l] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(src[offset], tmp1), tmp2), tmp3); + t[l + 8] = MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.5), tmp5), MS_MULQ_F32(tmp6, 1.5)); + t[l + 16] = MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp1, 0.25), tmp2), MS_MULQ_F32(tmp3, 2.25)); + t[l + 24] = MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.125), tmp5), MS_MULQ_F32(tmp6, 3.375)); + t[l + 32] = MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp1, 0.0625), tmp2), MS_MULQ_F32(tmp3, 5.0625)); + t[l + 40] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.03125), tmp5), MS_MULQ_F32(tmp6, 7.59375)), + src[7 + offset]); } for (int l = 0; l < 6; ++l) { int offset = l * 8; - float32x4_t tmp1 = vaddq_f32(t[1 + offset], t[2 + offset]); - float32x4_t tmp2 = vaddq_f32(t[3 + offset], t[4 + offset]); - float32x4_t tmp3 = vaddq_f32(t[5 + offset], t[6 + offset]); - float32x4_t tmp4 = vsubq_f32(t[1 + offset], t[2 + offset]); - float32x4_t tmp5 = vsubq_f32(t[3 + offset], t[4 + offset]); - float32x4_t tmp6 = vsubq_f32(t[5 + offset], t[6 + offset]); - m[l] = vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(t[offset], tmp1), tmp2), tmp3), bias_ptr); - m[l + 6] = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.5), tmp5), vmulq_n_f32(tmp6, 1.5)), bias_ptr); - m[l + 12] = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp1, 0.25), tmp2), vmulq_n_f32(tmp3, 2.25)), bias_ptr); - m[l + 18] = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.125), tmp5), vmulq_n_f32(tmp6, 3.375)), bias_ptr); - m[l + 24] = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp1, 
0.0625), tmp2), vmulq_n_f32(tmp3, 5.0625)), bias_ptr); - m[l + 30] = vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.03125), tmp5), vmulq_n_f32(tmp6, 7.59375)), t[7 + offset]), - bias_ptr); + MS_FLOAT32X4 tmp1 = MS_ADDQ_F32(t[1 + offset], t[2 + offset]); + MS_FLOAT32X4 tmp2 = MS_ADDQ_F32(t[3 + offset], t[4 + offset]); + MS_FLOAT32X4 tmp3 = MS_ADDQ_F32(t[5 + offset], t[6 + offset]); + MS_FLOAT32X4 tmp4 = MS_SUBQ_F32(t[1 + offset], t[2 + offset]); + MS_FLOAT32X4 tmp5 = MS_SUBQ_F32(t[3 + offset], t[4 + offset]); + MS_FLOAT32X4 tmp6 = MS_SUBQ_F32(t[5 + offset], t[6 + offset]); + m[l] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(t[offset], tmp1), tmp2), tmp3), bias_ptr); + m[l + 6] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.5), tmp5), MS_MULQ_F32(tmp6, 1.5)), bias_ptr); + m[l + 12] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp1, 0.25), tmp2), MS_MULQ_F32(tmp3, 2.25)), bias_ptr); + m[l + 18] = + MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.125), tmp5), MS_MULQ_F32(tmp6, 3.375)), bias_ptr); + m[l + 24] = + MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp1, 0.0625), tmp2), MS_MULQ_F32(tmp3, 5.0625)), bias_ptr); + m[l + 30] = + MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.03125), tmp5), MS_MULQ_F32(tmp6, 7.59375)), + t[7 + offset]), + bias_ptr); } if (r_c == C4NUM && r_h == 6 && r_w == 6) { for (int i = 0; i < 6; i++) { int dst_k_offset = i * dst_step * out_c; int m_k_offset = i * 6; - vst1q_f32(dst_data + dst_k_offset + 0 * out_c, m[m_k_offset]); - vst1q_f32(dst_data + dst_k_offset + 1 * out_c, m[m_k_offset + 1]); - vst1q_f32(dst_data + dst_k_offset + 2 * out_c, m[m_k_offset + 2]); - vst1q_f32(dst_data + dst_k_offset + 3 * out_c, m[m_k_offset + 3]); - vst1q_f32(dst_data + dst_k_offset + 4 * out_c, m[m_k_offset + 4]); - vst1q_f32(dst_data + dst_k_offset + 5 * out_c, m[m_k_offset + 5]); + MS_STQ_F32(dst_data + dst_k_offset + 0 * out_c, m[m_k_offset]); + MS_STQ_F32(dst_data + dst_k_offset + 1 * out_c, m[m_k_offset + 1]); + MS_STQ_F32(dst_data + dst_k_offset + 2 * out_c, m[m_k_offset + 2]); + MS_STQ_F32(dst_data + dst_k_offset + 3 * out_c, m[m_k_offset + 3]); + MS_STQ_F32(dst_data + dst_k_offset + 4 * out_c, m[m_k_offset + 4]); + MS_STQ_F32(dst_data + dst_k_offset + 5 * out_c, m[m_k_offset + 5]); } } else { for (int i = 0; i < r_c; i++) { @@ -3092,62 +3118,65 @@ void OutputTransform8x6Unit(const float *src_data, float *dst_data, const float void OutputTransform8x6ReluUnit(const float *src_data, float *dst_data, const float *bias_data, int src_step, int dst_step, int out_c, int r_w, int r_h, int r_c) { -#ifdef ENABLE_ARM - float32x4_t src[64]; - float32x4_t t[48]; - float32x4_t m[36]; - float32x4_t zero = vdupq_n_f32(0); +#if defined(ENABLE_ARM) || defined(ENABLE_SSE) + MS_FLOAT32X4 src[64]; + MS_FLOAT32X4 t[48]; + MS_FLOAT32X4 m[36]; + MS_FLOAT32X4 zero = MS_MOVQ_F32(0); Load64Data; - float32x4_t bias_ptr = vld1q_f32(bias_data); + MS_FLOAT32X4 bias_ptr = MS_LDQ_F32(bias_data); for (int l = 0; l < 8; ++l) { int offset = l * 8; - float32x4_t tmp1 = vaddq_f32(src[1 + offset], src[2 + offset]); - float32x4_t tmp2 = vaddq_f32(src[3 + offset], src[4 + offset]); - float32x4_t tmp3 = vaddq_f32(src[5 + offset], src[6 + offset]); - float32x4_t tmp4 = vsubq_f32(src[1 + offset], src[2 + offset]); - float32x4_t tmp5 = vsubq_f32(src[3 + offset], src[4 + offset]); - float32x4_t tmp6 = vsubq_f32(src[5 + offset], src[6 + offset]); - t[l] = vaddq_f32(vaddq_f32(vaddq_f32(src[offset], tmp1), tmp2), tmp3); - t[l + 8] = 
vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.5), tmp5), vmulq_n_f32(tmp6, 1.5)); - t[l + 16] = vaddq_f32(vaddq_f32(vmulq_n_f32(tmp1, 0.25), tmp2), vmulq_n_f32(tmp3, 2.25)); - t[l + 24] = vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.125), tmp5), vmulq_n_f32(tmp6, 3.375)); - t[l + 32] = vaddq_f32(vaddq_f32(vmulq_n_f32(tmp1, 0.0625), tmp2), vmulq_n_f32(tmp3, 5.0625)); - t[l + 40] = - vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.03125), tmp5), vmulq_n_f32(tmp6, 7.59375)), src[7 + offset]); + MS_FLOAT32X4 tmp1 = MS_ADDQ_F32(src[1 + offset], src[2 + offset]); + MS_FLOAT32X4 tmp2 = MS_ADDQ_F32(src[3 + offset], src[4 + offset]); + MS_FLOAT32X4 tmp3 = MS_ADDQ_F32(src[5 + offset], src[6 + offset]); + MS_FLOAT32X4 tmp4 = MS_SUBQ_F32(src[1 + offset], src[2 + offset]); + MS_FLOAT32X4 tmp5 = MS_SUBQ_F32(src[3 + offset], src[4 + offset]); + MS_FLOAT32X4 tmp6 = MS_SUBQ_F32(src[5 + offset], src[6 + offset]); + t[l] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(src[offset], tmp1), tmp2), tmp3); + t[l + 8] = MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.5), tmp5), MS_MULQ_F32(tmp6, 1.5)); + t[l + 16] = MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp1, 0.25), tmp2), MS_MULQ_F32(tmp3, 2.25)); + t[l + 24] = MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.125), tmp5), MS_MULQ_F32(tmp6, 3.375)); + t[l + 32] = MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp1, 0.0625), tmp2), MS_MULQ_F32(tmp3, 5.0625)); + t[l + 40] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.03125), tmp5), MS_MULQ_F32(tmp6, 7.59375)), + src[7 + offset]); } for (int l = 0; l < 6; ++l) { int offset = l * 8; - float32x4_t tmp1 = vaddq_f32(t[1 + offset], t[2 + offset]); - float32x4_t tmp2 = vaddq_f32(t[3 + offset], t[4 + offset]); - float32x4_t tmp3 = vaddq_f32(t[5 + offset], t[6 + offset]); - float32x4_t tmp4 = vsubq_f32(t[1 + offset], t[2 + offset]); - float32x4_t tmp5 = vsubq_f32(t[3 + offset], t[4 + offset]); - float32x4_t tmp6 = vsubq_f32(t[5 + offset], t[6 + offset]); - m[l] = vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(t[offset], tmp1), tmp2), tmp3), bias_ptr); - m[l + 6] = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.5), tmp5), vmulq_n_f32(tmp6, 1.5)), bias_ptr); - m[l + 12] = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp1, 0.25), tmp2), vmulq_n_f32(tmp3, 2.25)), bias_ptr); - m[l + 18] = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.125), tmp5), vmulq_n_f32(tmp6, 3.375)), bias_ptr); - m[l + 24] = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp1, 0.0625), tmp2), vmulq_n_f32(tmp3, 5.0625)), bias_ptr); - m[l + 30] = vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.03125), tmp5), vmulq_n_f32(tmp6, 7.59375)), t[7 + offset]), - bias_ptr); - m[l] = vmaxq_f32(zero, m[l]); - m[l + 6] = vmaxq_f32(zero, m[l + 6]); - m[l + 12] = vmaxq_f32(zero, m[l + 12]); - m[l + 18] = vmaxq_f32(zero, m[l + 18]); - m[l + 24] = vmaxq_f32(zero, m[l + 24]); - m[l + 30] = vmaxq_f32(zero, m[l + 30]); + MS_FLOAT32X4 tmp1 = MS_ADDQ_F32(t[1 + offset], t[2 + offset]); + MS_FLOAT32X4 tmp2 = MS_ADDQ_F32(t[3 + offset], t[4 + offset]); + MS_FLOAT32X4 tmp3 = MS_ADDQ_F32(t[5 + offset], t[6 + offset]); + MS_FLOAT32X4 tmp4 = MS_SUBQ_F32(t[1 + offset], t[2 + offset]); + MS_FLOAT32X4 tmp5 = MS_SUBQ_F32(t[3 + offset], t[4 + offset]); + MS_FLOAT32X4 tmp6 = MS_SUBQ_F32(t[5 + offset], t[6 + offset]); + m[l] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(t[offset], tmp1), tmp2), tmp3), bias_ptr); + m[l + 6] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.5), tmp5), MS_MULQ_F32(tmp6, 1.5)), bias_ptr); + m[l + 12] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp1, 0.25), tmp2), 
MS_MULQ_F32(tmp3, 2.25)), bias_ptr); + m[l + 18] = + MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.125), tmp5), MS_MULQ_F32(tmp6, 3.375)), bias_ptr); + m[l + 24] = + MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp1, 0.0625), tmp2), MS_MULQ_F32(tmp3, 5.0625)), bias_ptr); + m[l + 30] = + MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.03125), tmp5), MS_MULQ_F32(tmp6, 7.59375)), + t[7 + offset]), + bias_ptr); + m[l] = MS_MAXQ_F32(zero, m[l]); + m[l + 6] = MS_MAXQ_F32(zero, m[l + 6]); + m[l + 12] = MS_MAXQ_F32(zero, m[l + 12]); + m[l + 18] = MS_MAXQ_F32(zero, m[l + 18]); + m[l + 24] = MS_MAXQ_F32(zero, m[l + 24]); + m[l + 30] = MS_MAXQ_F32(zero, m[l + 30]); } if (r_c == C4NUM && r_h == 6 && r_w == 6) { for (int i = 0; i < 6; i++) { int dst_k_offset = i * dst_step * out_c; int m_k_offset = i * 6; - vst1q_f32(dst_data + dst_k_offset + 0 * out_c, m[m_k_offset]); - vst1q_f32(dst_data + dst_k_offset + 1 * out_c, m[m_k_offset + 1]); - vst1q_f32(dst_data + dst_k_offset + 2 * out_c, m[m_k_offset + 2]); - vst1q_f32(dst_data + dst_k_offset + 3 * out_c, m[m_k_offset + 3]); - vst1q_f32(dst_data + dst_k_offset + 4 * out_c, m[m_k_offset + 4]); - vst1q_f32(dst_data + dst_k_offset + 5 * out_c, m[m_k_offset + 5]); + MS_STQ_F32(dst_data + dst_k_offset + 0 * out_c, m[m_k_offset]); + MS_STQ_F32(dst_data + dst_k_offset + 1 * out_c, m[m_k_offset + 1]); + MS_STQ_F32(dst_data + dst_k_offset + 2 * out_c, m[m_k_offset + 2]); + MS_STQ_F32(dst_data + dst_k_offset + 3 * out_c, m[m_k_offset + 3]); + MS_STQ_F32(dst_data + dst_k_offset + 4 * out_c, m[m_k_offset + 4]); + MS_STQ_F32(dst_data + dst_k_offset + 5 * out_c, m[m_k_offset + 5]); } } else { for (int i = 0; i < r_c; i++) { @@ -3214,69 +3243,72 @@ void OutputTransform8x6ReluUnit(const float *src_data, float *dst_data, const fl void OutputTransform8x6Relu6Unit(const float *src_data, float *dst_data, const float *bias_data, int src_step, int dst_step, int out_c, int r_w, int r_h, int r_c) { -#ifdef ENABLE_ARM - float32x4_t src[64]; - float32x4_t t[48]; - float32x4_t m[36]; - float32x4_t zero = vdupq_n_f32(0); - float32x4_t six = vdupq_n_f32(6); +#if defined(ENABLE_ARM) || defined(ENABLE_SSE) + MS_FLOAT32X4 src[64]; + MS_FLOAT32X4 t[48]; + MS_FLOAT32X4 m[36]; + MS_FLOAT32X4 zero = MS_MOVQ_F32(0); + MS_FLOAT32X4 six = MS_MOVQ_F32(6); Load64Data; - float32x4_t bias_ptr = vld1q_f32(bias_data); + MS_FLOAT32X4 bias_ptr = MS_LDQ_F32(bias_data); for (int l = 0; l < 8; ++l) { int offset = l * 8; - float32x4_t tmp1 = vaddq_f32(src[1 + offset], src[2 + offset]); - float32x4_t tmp2 = vaddq_f32(src[3 + offset], src[4 + offset]); - float32x4_t tmp3 = vaddq_f32(src[5 + offset], src[6 + offset]); - float32x4_t tmp4 = vsubq_f32(src[1 + offset], src[2 + offset]); - float32x4_t tmp5 = vsubq_f32(src[3 + offset], src[4 + offset]); - float32x4_t tmp6 = vsubq_f32(src[5 + offset], src[6 + offset]); - t[l] = vaddq_f32(vaddq_f32(vaddq_f32(src[offset], tmp1), tmp2), tmp3); - t[l + 8] = vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.5), tmp5), vmulq_n_f32(tmp6, 1.5)); - t[l + 16] = vaddq_f32(vaddq_f32(vmulq_n_f32(tmp1, 0.25), tmp2), vmulq_n_f32(tmp3, 2.25)); - t[l + 24] = vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.125), tmp5), vmulq_n_f32(tmp6, 3.375)); - t[l + 32] = vaddq_f32(vaddq_f32(vmulq_n_f32(tmp1, 0.0625), tmp2), vmulq_n_f32(tmp3, 5.0625)); - t[l + 40] = - vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.03125), tmp5), vmulq_n_f32(tmp6, 7.59375)), src[7 + offset]); + MS_FLOAT32X4 tmp1 = MS_ADDQ_F32(src[1 + offset], src[2 + offset]); + MS_FLOAT32X4 tmp2 = 
MS_ADDQ_F32(src[3 + offset], src[4 + offset]); + MS_FLOAT32X4 tmp3 = MS_ADDQ_F32(src[5 + offset], src[6 + offset]); + MS_FLOAT32X4 tmp4 = MS_SUBQ_F32(src[1 + offset], src[2 + offset]); + MS_FLOAT32X4 tmp5 = MS_SUBQ_F32(src[3 + offset], src[4 + offset]); + MS_FLOAT32X4 tmp6 = MS_SUBQ_F32(src[5 + offset], src[6 + offset]); + t[l] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(src[offset], tmp1), tmp2), tmp3); + t[l + 8] = MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.5), tmp5), MS_MULQ_F32(tmp6, 1.5)); + t[l + 16] = MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp1, 0.25), tmp2), MS_MULQ_F32(tmp3, 2.25)); + t[l + 24] = MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.125), tmp5), MS_MULQ_F32(tmp6, 3.375)); + t[l + 32] = MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp1, 0.0625), tmp2), MS_MULQ_F32(tmp3, 5.0625)); + t[l + 40] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.03125), tmp5), MS_MULQ_F32(tmp6, 7.59375)), + src[7 + offset]); } for (int l = 0; l < 6; ++l) { int offset = l * 8; - float32x4_t tmp1 = vaddq_f32(t[1 + offset], t[2 + offset]); - float32x4_t tmp2 = vaddq_f32(t[3 + offset], t[4 + offset]); - float32x4_t tmp3 = vaddq_f32(t[5 + offset], t[6 + offset]); - float32x4_t tmp4 = vsubq_f32(t[1 + offset], t[2 + offset]); - float32x4_t tmp5 = vsubq_f32(t[3 + offset], t[4 + offset]); - float32x4_t tmp6 = vsubq_f32(t[5 + offset], t[6 + offset]); - m[l] = vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(t[offset], tmp1), tmp2), tmp3), bias_ptr); - m[l + 6] = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.5), tmp5), vmulq_n_f32(tmp6, 1.5)), bias_ptr); - m[l + 12] = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp1, 0.25), tmp2), vmulq_n_f32(tmp3, 2.25)), bias_ptr); - m[l + 18] = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.125), tmp5), vmulq_n_f32(tmp6, 3.375)), bias_ptr); - m[l + 24] = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp1, 0.0625), tmp2), vmulq_n_f32(tmp3, 5.0625)), bias_ptr); - m[l + 30] = vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.03125), tmp5), vmulq_n_f32(tmp6, 7.59375)), t[7 + offset]), - bias_ptr); - m[l] = vmaxq_f32(zero, m[l]); - m[l] = vminq_f32(six, m[l]); - m[l + 6] = vmaxq_f32(zero, m[l + 6]); - m[l + 6] = vminq_f32(six, m[l + 6]); - m[l + 12] = vmaxq_f32(zero, m[l + 12]); - m[l + 12] = vminq_f32(six, m[l + 12]); - m[l + 18] = vmaxq_f32(zero, m[l + 18]); - m[l + 18] = vminq_f32(six, m[l + 18]); - m[l + 24] = vmaxq_f32(zero, m[l + 24]); - m[l + 24] = vminq_f32(six, m[l + 24]); - m[l + 30] = vmaxq_f32(zero, m[l + 30]); - m[l + 30] = vminq_f32(six, m[l + 30]); + MS_FLOAT32X4 tmp1 = MS_ADDQ_F32(t[1 + offset], t[2 + offset]); + MS_FLOAT32X4 tmp2 = MS_ADDQ_F32(t[3 + offset], t[4 + offset]); + MS_FLOAT32X4 tmp3 = MS_ADDQ_F32(t[5 + offset], t[6 + offset]); + MS_FLOAT32X4 tmp4 = MS_SUBQ_F32(t[1 + offset], t[2 + offset]); + MS_FLOAT32X4 tmp5 = MS_SUBQ_F32(t[3 + offset], t[4 + offset]); + MS_FLOAT32X4 tmp6 = MS_SUBQ_F32(t[5 + offset], t[6 + offset]); + m[l] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(t[offset], tmp1), tmp2), tmp3), bias_ptr); + m[l + 6] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.5), tmp5), MS_MULQ_F32(tmp6, 1.5)), bias_ptr); + m[l + 12] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp1, 0.25), tmp2), MS_MULQ_F32(tmp3, 2.25)), bias_ptr); + m[l + 18] = + MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.125), tmp5), MS_MULQ_F32(tmp6, 3.375)), bias_ptr); + m[l + 24] = + MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp1, 0.0625), tmp2), MS_MULQ_F32(tmp3, 5.0625)), bias_ptr); + m[l + 30] = + 
MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.03125), tmp5), MS_MULQ_F32(tmp6, 7.59375)), + t[7 + offset]), + bias_ptr); + m[l] = MS_MAXQ_F32(zero, m[l]); + m[l] = MS_MINQ_F32(six, m[l]); + m[l + 6] = MS_MAXQ_F32(zero, m[l + 6]); + m[l + 6] = MS_MINQ_F32(six, m[l + 6]); + m[l + 12] = MS_MAXQ_F32(zero, m[l + 12]); + m[l + 12] = MS_MINQ_F32(six, m[l + 12]); + m[l + 18] = MS_MAXQ_F32(zero, m[l + 18]); + m[l + 18] = MS_MINQ_F32(six, m[l + 18]); + m[l + 24] = MS_MAXQ_F32(zero, m[l + 24]); + m[l + 24] = MS_MINQ_F32(six, m[l + 24]); + m[l + 30] = MS_MAXQ_F32(zero, m[l + 30]); + m[l + 30] = MS_MINQ_F32(six, m[l + 30]); } if (r_c == C4NUM && r_h == 6 && r_w == 6) { for (int i = 0; i < 6; i++) { int dst_k_offset = i * dst_step * out_c; int m_k_offset = i * 6; - vst1q_f32(dst_data + dst_k_offset + 0 * out_c, m[m_k_offset]); - vst1q_f32(dst_data + dst_k_offset + 1 * out_c, m[m_k_offset + 1]); - vst1q_f32(dst_data + dst_k_offset + 2 * out_c, m[m_k_offset + 2]); - vst1q_f32(dst_data + dst_k_offset + 3 * out_c, m[m_k_offset + 3]); - vst1q_f32(dst_data + dst_k_offset + 4 * out_c, m[m_k_offset + 4]); - vst1q_f32(dst_data + dst_k_offset + 5 * out_c, m[m_k_offset + 5]); + MS_STQ_F32(dst_data + dst_k_offset + 0 * out_c, m[m_k_offset]); + MS_STQ_F32(dst_data + dst_k_offset + 1 * out_c, m[m_k_offset + 1]); + MS_STQ_F32(dst_data + dst_k_offset + 2 * out_c, m[m_k_offset + 2]); + MS_STQ_F32(dst_data + dst_k_offset + 3 * out_c, m[m_k_offset + 3]); + MS_STQ_F32(dst_data + dst_k_offset + 4 * out_c, m[m_k_offset + 4]); + MS_STQ_F32(dst_data + dst_k_offset + 5 * out_c, m[m_k_offset + 5]); } } else { for (int i = 0; i < r_c; i++) { @@ -3344,58 +3376,62 @@ void OutputTransform8x6Relu6Unit(const float *src_data, float *dst_data, const f void OutputTransform8x7Unit(const float *src_data, float *dst_data, const float *bias_data, int src_step, int dst_step, int out_c, int r_w, int r_h, int r_c) { -#ifdef ENABLE_ARM - float32x4_t src[64]; - float32x4_t t[56]; - float32x4_t m[49]; +#if defined(ENABLE_ARM) || defined(ENABLE_SSE) + MS_FLOAT32X4 src[64]; + MS_FLOAT32X4 t[56]; + MS_FLOAT32X4 m[49]; Load64Data; - float32x4_t bias_ptr = vld1q_f32(bias_data); + MS_FLOAT32X4 bias_ptr = MS_LDQ_F32(bias_data); for (int l = 0; l < 8; ++l) { int offset = l * 8; - float32x4_t tmp1 = vaddq_f32(src[1 + offset], src[2 + offset]); - float32x4_t tmp2 = vaddq_f32(src[3 + offset], src[4 + offset]); - float32x4_t tmp3 = vaddq_f32(src[5 + offset], src[6 + offset]); - float32x4_t tmp4 = vsubq_f32(src[1 + offset], src[2 + offset]); - float32x4_t tmp5 = vsubq_f32(src[3 + offset], src[4 + offset]); - float32x4_t tmp6 = vsubq_f32(src[5 + offset], src[6 + offset]); - t[l] = vaddq_f32(vaddq_f32(vaddq_f32(src[offset], tmp1), tmp2), tmp3); - t[l + 8] = vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.5), tmp5), vmulq_n_f32(tmp6, 1.5)); - t[l + 16] = vaddq_f32(vaddq_f32(vmulq_n_f32(tmp1, 0.25), tmp2), vmulq_n_f32(tmp3, 2.25)); - t[l + 24] = vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.125), tmp5), vmulq_n_f32(tmp6, 3.375)); - t[l + 32] = vaddq_f32(vaddq_f32(vmulq_n_f32(tmp1, 0.0625), tmp2), vmulq_n_f32(tmp3, 5.0625)); - t[l + 40] = vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.03125), tmp5), vmulq_n_f32(tmp6, 7.59375)); - t[l + 48] = - vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp1, 0.015625), tmp2), vmulq_n_f32(tmp3, 11.390625)), src[7 + offset]); + MS_FLOAT32X4 tmp1 = MS_ADDQ_F32(src[1 + offset], src[2 + offset]); + MS_FLOAT32X4 tmp2 = MS_ADDQ_F32(src[3 + offset], src[4 + offset]); + MS_FLOAT32X4 tmp3 = MS_ADDQ_F32(src[5 + offset], src[6 + offset]); + 
MS_FLOAT32X4 tmp4 = MS_SUBQ_F32(src[1 + offset], src[2 + offset]); + MS_FLOAT32X4 tmp5 = MS_SUBQ_F32(src[3 + offset], src[4 + offset]); + MS_FLOAT32X4 tmp6 = MS_SUBQ_F32(src[5 + offset], src[6 + offset]); + t[l] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(src[offset], tmp1), tmp2), tmp3); + t[l + 8] = MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.5), tmp5), MS_MULQ_F32(tmp6, 1.5)); + t[l + 16] = MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp1, 0.25), tmp2), MS_MULQ_F32(tmp3, 2.25)); + t[l + 24] = MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.125), tmp5), MS_MULQ_F32(tmp6, 3.375)); + t[l + 32] = MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp1, 0.0625), tmp2), MS_MULQ_F32(tmp3, 5.0625)); + t[l + 40] = MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.03125), tmp5), MS_MULQ_F32(tmp6, 7.59375)); + t[l + 48] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp1, 0.015625), tmp2), MS_MULQ_F32(tmp3, 11.390625)), + src[7 + offset]); } for (int l = 0; l < 7; ++l) { int offset = l * 8; - float32x4_t tmp1 = vaddq_f32(t[1 + offset], t[2 + offset]); - float32x4_t tmp2 = vaddq_f32(t[3 + offset], t[4 + offset]); - float32x4_t tmp3 = vaddq_f32(t[5 + offset], t[6 + offset]); - float32x4_t tmp4 = vsubq_f32(t[1 + offset], t[2 + offset]); - float32x4_t tmp5 = vsubq_f32(t[3 + offset], t[4 + offset]); - float32x4_t tmp6 = vsubq_f32(t[5 + offset], t[6 + offset]); - m[l] = vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(t[offset], tmp1), tmp2), tmp3), bias_ptr); - m[l + 7] = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.5), tmp5), vmulq_n_f32(tmp6, 1.5)), bias_ptr); - m[l + 14] = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp1, 0.25), tmp2), vmulq_n_f32(tmp3, 2.25)), bias_ptr); - m[l + 21] = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.125), tmp5), vmulq_n_f32(tmp6, 3.375)), bias_ptr); - m[l + 28] = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp1, 0.0625), tmp2), vmulq_n_f32(tmp3, 5.0625)), bias_ptr); - m[l + 35] = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.03125), tmp5), vmulq_n_f32(tmp6, 7.59375)), bias_ptr); - m[l + 42] = vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp1, 0.015625), tmp2), vmulq_n_f32(tmp3, 11.390625)), t[7 + offset]), - bias_ptr); + MS_FLOAT32X4 tmp1 = MS_ADDQ_F32(t[1 + offset], t[2 + offset]); + MS_FLOAT32X4 tmp2 = MS_ADDQ_F32(t[3 + offset], t[4 + offset]); + MS_FLOAT32X4 tmp3 = MS_ADDQ_F32(t[5 + offset], t[6 + offset]); + MS_FLOAT32X4 tmp4 = MS_SUBQ_F32(t[1 + offset], t[2 + offset]); + MS_FLOAT32X4 tmp5 = MS_SUBQ_F32(t[3 + offset], t[4 + offset]); + MS_FLOAT32X4 tmp6 = MS_SUBQ_F32(t[5 + offset], t[6 + offset]); + m[l] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(t[offset], tmp1), tmp2), tmp3), bias_ptr); + m[l + 7] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.5), tmp5), MS_MULQ_F32(tmp6, 1.5)), bias_ptr); + m[l + 14] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp1, 0.25), tmp2), MS_MULQ_F32(tmp3, 2.25)), bias_ptr); + m[l + 21] = + MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.125), tmp5), MS_MULQ_F32(tmp6, 3.375)), bias_ptr); + m[l + 28] = + MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp1, 0.0625), tmp2), MS_MULQ_F32(tmp3, 5.0625)), bias_ptr); + m[l + 35] = + MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.03125), tmp5), MS_MULQ_F32(tmp6, 7.59375)), bias_ptr); + m[l + 42] = + MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp1, 0.015625), tmp2), MS_MULQ_F32(tmp3, 11.390625)), + t[7 + offset]), + bias_ptr); } if (r_c == C4NUM && r_h == 7 && r_w == 7) { for (int i = 0; i < 7; i++) { int dst_k_offset = i * dst_step * out_c; int m_k_offset = i * 
7; - vst1q_f32(dst_data + dst_k_offset + 0 * out_c, m[m_k_offset]); - vst1q_f32(dst_data + dst_k_offset + 1 * out_c, m[m_k_offset + 1]); - vst1q_f32(dst_data + dst_k_offset + 2 * out_c, m[m_k_offset + 2]); - vst1q_f32(dst_data + dst_k_offset + 3 * out_c, m[m_k_offset + 3]); - vst1q_f32(dst_data + dst_k_offset + 4 * out_c, m[m_k_offset + 4]); - vst1q_f32(dst_data + dst_k_offset + 5 * out_c, m[m_k_offset + 5]); - vst1q_f32(dst_data + dst_k_offset + 6 * out_c, m[m_k_offset + 6]); + MS_STQ_F32(dst_data + dst_k_offset + 0 * out_c, m[m_k_offset]); + MS_STQ_F32(dst_data + dst_k_offset + 1 * out_c, m[m_k_offset + 1]); + MS_STQ_F32(dst_data + dst_k_offset + 2 * out_c, m[m_k_offset + 2]); + MS_STQ_F32(dst_data + dst_k_offset + 3 * out_c, m[m_k_offset + 3]); + MS_STQ_F32(dst_data + dst_k_offset + 4 * out_c, m[m_k_offset + 4]); + MS_STQ_F32(dst_data + dst_k_offset + 5 * out_c, m[m_k_offset + 5]); + MS_STQ_F32(dst_data + dst_k_offset + 6 * out_c, m[m_k_offset + 6]); } } else { for (int i = 0; i < r_c; i++) { @@ -3464,66 +3500,70 @@ void OutputTransform8x7Unit(const float *src_data, float *dst_data, const float void OutputTransform8x7ReluUnit(const float *src_data, float *dst_data, const float *bias_data, int src_step, int dst_step, int out_c, int r_w, int r_h, int r_c) { -#ifdef ENABLE_ARM - float32x4_t src[64]; - float32x4_t t[56]; - float32x4_t m[49]; - float32x4_t zero = vdupq_n_f32(0); +#if defined(ENABLE_ARM) || defined(ENABLE_SSE) + MS_FLOAT32X4 src[64]; + MS_FLOAT32X4 t[56]; + MS_FLOAT32X4 m[49]; + MS_FLOAT32X4 zero = MS_MOVQ_F32(0); Load64Data; - float32x4_t bias_ptr = vld1q_f32(bias_data); + MS_FLOAT32X4 bias_ptr = MS_LDQ_F32(bias_data); for (int l = 0; l < 8; ++l) { int offset = l * 8; - float32x4_t tmp1 = vaddq_f32(src[1 + offset], src[2 + offset]); - float32x4_t tmp2 = vaddq_f32(src[3 + offset], src[4 + offset]); - float32x4_t tmp3 = vaddq_f32(src[5 + offset], src[6 + offset]); - float32x4_t tmp4 = vsubq_f32(src[1 + offset], src[2 + offset]); - float32x4_t tmp5 = vsubq_f32(src[3 + offset], src[4 + offset]); - float32x4_t tmp6 = vsubq_f32(src[5 + offset], src[6 + offset]); - t[l] = vaddq_f32(vaddq_f32(vaddq_f32(src[offset], tmp1), tmp2), tmp3); - t[l + 8] = vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.5), tmp5), vmulq_n_f32(tmp6, 1.5)); - t[l + 16] = vaddq_f32(vaddq_f32(vmulq_n_f32(tmp1, 0.25), tmp2), vmulq_n_f32(tmp3, 2.25)); - t[l + 24] = vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.125), tmp5), vmulq_n_f32(tmp6, 3.375)); - t[l + 32] = vaddq_f32(vaddq_f32(vmulq_n_f32(tmp1, 0.0625), tmp2), vmulq_n_f32(tmp3, 5.0625)); - t[l + 40] = vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.03125), tmp5), vmulq_n_f32(tmp6, 7.59375)); - t[l + 48] = - vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp1, 0.015625), tmp2), vmulq_n_f32(tmp3, 11.390625)), src[7 + offset]); + MS_FLOAT32X4 tmp1 = MS_ADDQ_F32(src[1 + offset], src[2 + offset]); + MS_FLOAT32X4 tmp2 = MS_ADDQ_F32(src[3 + offset], src[4 + offset]); + MS_FLOAT32X4 tmp3 = MS_ADDQ_F32(src[5 + offset], src[6 + offset]); + MS_FLOAT32X4 tmp4 = MS_SUBQ_F32(src[1 + offset], src[2 + offset]); + MS_FLOAT32X4 tmp5 = MS_SUBQ_F32(src[3 + offset], src[4 + offset]); + MS_FLOAT32X4 tmp6 = MS_SUBQ_F32(src[5 + offset], src[6 + offset]); + t[l] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(src[offset], tmp1), tmp2), tmp3); + t[l + 8] = MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.5), tmp5), MS_MULQ_F32(tmp6, 1.5)); + t[l + 16] = MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp1, 0.25), tmp2), MS_MULQ_F32(tmp3, 2.25)); + t[l + 24] = MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.125), tmp5), MS_MULQ_F32(tmp6, 
3.375)); + t[l + 32] = MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp1, 0.0625), tmp2), MS_MULQ_F32(tmp3, 5.0625)); + t[l + 40] = MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.03125), tmp5), MS_MULQ_F32(tmp6, 7.59375)); + t[l + 48] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp1, 0.015625), tmp2), MS_MULQ_F32(tmp3, 11.390625)), + src[7 + offset]); } for (int l = 0; l < 7; ++l) { int offset = l * 8; - float32x4_t tmp1 = vaddq_f32(t[1 + offset], t[2 + offset]); - float32x4_t tmp2 = vaddq_f32(t[3 + offset], t[4 + offset]); - float32x4_t tmp3 = vaddq_f32(t[5 + offset], t[6 + offset]); - float32x4_t tmp4 = vsubq_f32(t[1 + offset], t[2 + offset]); - float32x4_t tmp5 = vsubq_f32(t[3 + offset], t[4 + offset]); - float32x4_t tmp6 = vsubq_f32(t[5 + offset], t[6 + offset]); - m[l] = vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(t[offset], tmp1), tmp2), tmp3), bias_ptr); - m[l + 7] = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.5), tmp5), vmulq_n_f32(tmp6, 1.5)), bias_ptr); - m[l + 14] = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp1, 0.25), tmp2), vmulq_n_f32(tmp3, 2.25)), bias_ptr); - m[l + 21] = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.125), tmp5), vmulq_n_f32(tmp6, 3.375)), bias_ptr); - m[l + 28] = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp1, 0.0625), tmp2), vmulq_n_f32(tmp3, 5.0625)), bias_ptr); - m[l + 35] = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.03125), tmp5), vmulq_n_f32(tmp6, 7.59375)), bias_ptr); - m[l + 42] = vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp1, 0.015625), tmp2), vmulq_n_f32(tmp3, 11.390625)), t[7 + offset]), - bias_ptr); - m[l] = vmaxq_f32(zero, m[l]); - m[l + 7] = vmaxq_f32(zero, m[l + 7]); - m[l + 14] = vmaxq_f32(zero, m[l + 14]); - m[l + 21] = vmaxq_f32(zero, m[l + 21]); - m[l + 28] = vmaxq_f32(zero, m[l + 28]); - m[l + 35] = vmaxq_f32(zero, m[l + 35]); - m[l + 42] = vmaxq_f32(zero, m[l + 42]); + MS_FLOAT32X4 tmp1 = MS_ADDQ_F32(t[1 + offset], t[2 + offset]); + MS_FLOAT32X4 tmp2 = MS_ADDQ_F32(t[3 + offset], t[4 + offset]); + MS_FLOAT32X4 tmp3 = MS_ADDQ_F32(t[5 + offset], t[6 + offset]); + MS_FLOAT32X4 tmp4 = MS_SUBQ_F32(t[1 + offset], t[2 + offset]); + MS_FLOAT32X4 tmp5 = MS_SUBQ_F32(t[3 + offset], t[4 + offset]); + MS_FLOAT32X4 tmp6 = MS_SUBQ_F32(t[5 + offset], t[6 + offset]); + m[l] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(t[offset], tmp1), tmp2), tmp3), bias_ptr); + m[l + 7] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.5), tmp5), MS_MULQ_F32(tmp6, 1.5)), bias_ptr); + m[l + 14] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp1, 0.25), tmp2), MS_MULQ_F32(tmp3, 2.25)), bias_ptr); + m[l + 21] = + MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.125), tmp5), MS_MULQ_F32(tmp6, 3.375)), bias_ptr); + m[l + 28] = + MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp1, 0.0625), tmp2), MS_MULQ_F32(tmp3, 5.0625)), bias_ptr); + m[l + 35] = + MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.03125), tmp5), MS_MULQ_F32(tmp6, 7.59375)), bias_ptr); + m[l + 42] = + MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp1, 0.015625), tmp2), MS_MULQ_F32(tmp3, 11.390625)), + t[7 + offset]), + bias_ptr); + m[l] = MS_MAXQ_F32(zero, m[l]); + m[l + 7] = MS_MAXQ_F32(zero, m[l + 7]); + m[l + 14] = MS_MAXQ_F32(zero, m[l + 14]); + m[l + 21] = MS_MAXQ_F32(zero, m[l + 21]); + m[l + 28] = MS_MAXQ_F32(zero, m[l + 28]); + m[l + 35] = MS_MAXQ_F32(zero, m[l + 35]); + m[l + 42] = MS_MAXQ_F32(zero, m[l + 42]); } if (r_c == C4NUM && r_h == 7 && r_w == 7) { for (int i = 0; i < 7; i++) { int dst_k_offset = i * dst_step * out_c; int 
m_k_offset = i * 7; - vst1q_f32(dst_data + dst_k_offset + 0 * out_c, m[m_k_offset]); - vst1q_f32(dst_data + dst_k_offset + 1 * out_c, m[m_k_offset + 1]); - vst1q_f32(dst_data + dst_k_offset + 2 * out_c, m[m_k_offset + 2]); - vst1q_f32(dst_data + dst_k_offset + 3 * out_c, m[m_k_offset + 3]); - vst1q_f32(dst_data + dst_k_offset + 4 * out_c, m[m_k_offset + 4]); - vst1q_f32(dst_data + dst_k_offset + 5 * out_c, m[m_k_offset + 5]); - vst1q_f32(dst_data + dst_k_offset + 6 * out_c, m[m_k_offset + 6]); + MS_STQ_F32(dst_data + dst_k_offset + 0 * out_c, m[m_k_offset]); + MS_STQ_F32(dst_data + dst_k_offset + 1 * out_c, m[m_k_offset + 1]); + MS_STQ_F32(dst_data + dst_k_offset + 2 * out_c, m[m_k_offset + 2]); + MS_STQ_F32(dst_data + dst_k_offset + 3 * out_c, m[m_k_offset + 3]); + MS_STQ_F32(dst_data + dst_k_offset + 4 * out_c, m[m_k_offset + 4]); + MS_STQ_F32(dst_data + dst_k_offset + 5 * out_c, m[m_k_offset + 5]); + MS_STQ_F32(dst_data + dst_k_offset + 6 * out_c, m[m_k_offset + 6]); } } else { for (int i = 0; i < r_c; i++) { @@ -3594,74 +3634,78 @@ void OutputTransform8x7ReluUnit(const float *src_data, float *dst_data, const fl void OutputTransform8x7Relu6Unit(const float *src_data, float *dst_data, const float *bias_data, int src_step, int dst_step, int out_c, int r_w, int r_h, int r_c) { -#ifdef ENABLE_ARM - float32x4_t src[64]; - float32x4_t t[56]; - float32x4_t m[49]; - float32x4_t zero = vdupq_n_f32(0); - float32x4_t six = vdupq_n_f32(6); +#if defined(ENABLE_ARM) || defined(ENABLE_SSE) + MS_FLOAT32X4 src[64]; + MS_FLOAT32X4 t[56]; + MS_FLOAT32X4 m[49]; + MS_FLOAT32X4 zero = MS_MOVQ_F32(0); + MS_FLOAT32X4 six = MS_MOVQ_F32(6); Load64Data; - float32x4_t bias_ptr = vld1q_f32(bias_data); + MS_FLOAT32X4 bias_ptr = MS_LDQ_F32(bias_data); for (int l = 0; l < 8; ++l) { int offset = l * 8; - float32x4_t tmp1 = vaddq_f32(src[1 + offset], src[2 + offset]); - float32x4_t tmp2 = vaddq_f32(src[3 + offset], src[4 + offset]); - float32x4_t tmp3 = vaddq_f32(src[5 + offset], src[6 + offset]); - float32x4_t tmp4 = vsubq_f32(src[1 + offset], src[2 + offset]); - float32x4_t tmp5 = vsubq_f32(src[3 + offset], src[4 + offset]); - float32x4_t tmp6 = vsubq_f32(src[5 + offset], src[6 + offset]); - t[l] = vaddq_f32(vaddq_f32(vaddq_f32(src[offset], tmp1), tmp2), tmp3); - t[l + 8] = vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.5), tmp5), vmulq_n_f32(tmp6, 1.5)); - t[l + 16] = vaddq_f32(vaddq_f32(vmulq_n_f32(tmp1, 0.25), tmp2), vmulq_n_f32(tmp3, 2.25)); - t[l + 24] = vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.125), tmp5), vmulq_n_f32(tmp6, 3.375)); - t[l + 32] = vaddq_f32(vaddq_f32(vmulq_n_f32(tmp1, 0.0625), tmp2), vmulq_n_f32(tmp3, 5.0625)); - t[l + 40] = vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.03125), tmp5), vmulq_n_f32(tmp6, 7.59375)); - t[l + 48] = - vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp1, 0.015625), tmp2), vmulq_n_f32(tmp3, 11.390625)), src[7 + offset]); + MS_FLOAT32X4 tmp1 = MS_ADDQ_F32(src[1 + offset], src[2 + offset]); + MS_FLOAT32X4 tmp2 = MS_ADDQ_F32(src[3 + offset], src[4 + offset]); + MS_FLOAT32X4 tmp3 = MS_ADDQ_F32(src[5 + offset], src[6 + offset]); + MS_FLOAT32X4 tmp4 = MS_SUBQ_F32(src[1 + offset], src[2 + offset]); + MS_FLOAT32X4 tmp5 = MS_SUBQ_F32(src[3 + offset], src[4 + offset]); + MS_FLOAT32X4 tmp6 = MS_SUBQ_F32(src[5 + offset], src[6 + offset]); + t[l] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(src[offset], tmp1), tmp2), tmp3); + t[l + 8] = MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.5), tmp5), MS_MULQ_F32(tmp6, 1.5)); + t[l + 16] = MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp1, 0.25), tmp2), MS_MULQ_F32(tmp3, 2.25)); 
+ t[l + 24] = MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.125), tmp5), MS_MULQ_F32(tmp6, 3.375)); + t[l + 32] = MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp1, 0.0625), tmp2), MS_MULQ_F32(tmp3, 5.0625)); + t[l + 40] = MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.03125), tmp5), MS_MULQ_F32(tmp6, 7.59375)); + t[l + 48] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp1, 0.015625), tmp2), MS_MULQ_F32(tmp3, 11.390625)), + src[7 + offset]); } for (int l = 0; l < 7; ++l) { int offset = l * 8; - float32x4_t tmp1 = vaddq_f32(t[1 + offset], t[2 + offset]); - float32x4_t tmp2 = vaddq_f32(t[3 + offset], t[4 + offset]); - float32x4_t tmp3 = vaddq_f32(t[5 + offset], t[6 + offset]); - float32x4_t tmp4 = vsubq_f32(t[1 + offset], t[2 + offset]); - float32x4_t tmp5 = vsubq_f32(t[3 + offset], t[4 + offset]); - float32x4_t tmp6 = vsubq_f32(t[5 + offset], t[6 + offset]); - m[l] = vaddq_f32(vaddq_f32(vaddq_f32(vaddq_f32(t[offset], tmp1), tmp2), tmp3), bias_ptr); - m[l + 7] = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.5), tmp5), vmulq_n_f32(tmp6, 1.5)), bias_ptr); - m[l + 14] = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp1, 0.25), tmp2), vmulq_n_f32(tmp3, 2.25)), bias_ptr); - m[l + 21] = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.125), tmp5), vmulq_n_f32(tmp6, 3.375)), bias_ptr); - m[l + 28] = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp1, 0.0625), tmp2), vmulq_n_f32(tmp3, 5.0625)), bias_ptr); - m[l + 35] = vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp4, 0.03125), tmp5), vmulq_n_f32(tmp6, 7.59375)), bias_ptr); - m[l + 42] = vaddq_f32( - vaddq_f32(vaddq_f32(vaddq_f32(vmulq_n_f32(tmp1, 0.015625), tmp2), vmulq_n_f32(tmp3, 11.390625)), t[7 + offset]), - bias_ptr); - m[l] = vmaxq_f32(zero, m[l]); - m[l] = vminq_f32(six, m[l]); - m[l + 7] = vmaxq_f32(zero, m[l + 7]); - m[l + 7] = vminq_f32(six, m[l + 7]); - m[l + 14] = vmaxq_f32(zero, m[l + 14]); - m[l + 14] = vminq_f32(six, m[l + 14]); - m[l + 21] = vmaxq_f32(zero, m[l + 21]); - m[l + 21] = vminq_f32(six, m[l + 21]); - m[l + 28] = vmaxq_f32(zero, m[l + 28]); - m[l + 28] = vminq_f32(six, m[l + 28]); - m[l + 35] = vmaxq_f32(zero, m[l + 35]); - m[l + 35] = vminq_f32(six, m[l + 35]); - m[l + 42] = vmaxq_f32(zero, m[l + 42]); - m[l + 42] = vminq_f32(six, m[l + 42]); + MS_FLOAT32X4 tmp1 = MS_ADDQ_F32(t[1 + offset], t[2 + offset]); + MS_FLOAT32X4 tmp2 = MS_ADDQ_F32(t[3 + offset], t[4 + offset]); + MS_FLOAT32X4 tmp3 = MS_ADDQ_F32(t[5 + offset], t[6 + offset]); + MS_FLOAT32X4 tmp4 = MS_SUBQ_F32(t[1 + offset], t[2 + offset]); + MS_FLOAT32X4 tmp5 = MS_SUBQ_F32(t[3 + offset], t[4 + offset]); + MS_FLOAT32X4 tmp6 = MS_SUBQ_F32(t[5 + offset], t[6 + offset]); + m[l] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(t[offset], tmp1), tmp2), tmp3), bias_ptr); + m[l + 7] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.5), tmp5), MS_MULQ_F32(tmp6, 1.5)), bias_ptr); + m[l + 14] = MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp1, 0.25), tmp2), MS_MULQ_F32(tmp3, 2.25)), bias_ptr); + m[l + 21] = + MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.125), tmp5), MS_MULQ_F32(tmp6, 3.375)), bias_ptr); + m[l + 28] = + MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp1, 0.0625), tmp2), MS_MULQ_F32(tmp3, 5.0625)), bias_ptr); + m[l + 35] = + MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp4, 0.03125), tmp5), MS_MULQ_F32(tmp6, 7.59375)), bias_ptr); + m[l + 42] = + MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_ADDQ_F32(MS_MULQ_F32(tmp1, 0.015625), tmp2), MS_MULQ_F32(tmp3, 11.390625)), + t[7 + offset]), + bias_ptr); + m[l] = MS_MAXQ_F32(zero, m[l]); + m[l] = 
MS_MINQ_F32(six, m[l]); + m[l + 7] = MS_MAXQ_F32(zero, m[l + 7]); + m[l + 7] = MS_MINQ_F32(six, m[l + 7]); + m[l + 14] = MS_MAXQ_F32(zero, m[l + 14]); + m[l + 14] = MS_MINQ_F32(six, m[l + 14]); + m[l + 21] = MS_MAXQ_F32(zero, m[l + 21]); + m[l + 21] = MS_MINQ_F32(six, m[l + 21]); + m[l + 28] = MS_MAXQ_F32(zero, m[l + 28]); + m[l + 28] = MS_MINQ_F32(six, m[l + 28]); + m[l + 35] = MS_MAXQ_F32(zero, m[l + 35]); + m[l + 35] = MS_MINQ_F32(six, m[l + 35]); + m[l + 42] = MS_MAXQ_F32(zero, m[l + 42]); + m[l + 42] = MS_MINQ_F32(six, m[l + 42]); } if (r_c == C4NUM && r_h == 7 && r_w == 7) { for (int i = 0; i < 7; i++) { int dst_k_offset = i * dst_step * out_c; int m_k_offset = i * 7; - vst1q_f32(dst_data + dst_k_offset + 0 * out_c, m[m_k_offset]); - vst1q_f32(dst_data + dst_k_offset + 1 * out_c, m[m_k_offset + 1]); - vst1q_f32(dst_data + dst_k_offset + 2 * out_c, m[m_k_offset + 2]); - vst1q_f32(dst_data + dst_k_offset + 3 * out_c, m[m_k_offset + 3]); - vst1q_f32(dst_data + dst_k_offset + 4 * out_c, m[m_k_offset + 4]); - vst1q_f32(dst_data + dst_k_offset + 5 * out_c, m[m_k_offset + 5]); - vst1q_f32(dst_data + dst_k_offset + 6 * out_c, m[m_k_offset + 6]); + MS_STQ_F32(dst_data + dst_k_offset + 0 * out_c, m[m_k_offset]); + MS_STQ_F32(dst_data + dst_k_offset + 1 * out_c, m[m_k_offset + 1]); + MS_STQ_F32(dst_data + dst_k_offset + 2 * out_c, m[m_k_offset + 2]); + MS_STQ_F32(dst_data + dst_k_offset + 3 * out_c, m[m_k_offset + 3]); + MS_STQ_F32(dst_data + dst_k_offset + 4 * out_c, m[m_k_offset + 4]); + MS_STQ_F32(dst_data + dst_k_offset + 5 * out_c, m[m_k_offset + 5]); + MS_STQ_F32(dst_data + dst_k_offset + 6 * out_c, m[m_k_offset + 6]); } } else { for (int i = 0; i < r_c; i++) { diff --git a/mindspore/lite/nnacl/winograd_utils.h b/mindspore/lite/nnacl/winograd_utils.h index 18b5783f2e..34f170a753 100644 --- a/mindspore/lite/nnacl/winograd_utils.h +++ b/mindspore/lite/nnacl/winograd_utils.h @@ -39,127 +39,127 @@ void GeneralInputTransformUnit(const float *src_data, float *dst_data, const flo void GeneralOutputTransformUnit(const float *src_data, float *dst_data, const float *bias_data, const float *matrix_a, const float *matrix_at, int src_step, int dst_step, int in_unit, int out_unit); -#define Load16Data \ - src[0] = vld1q_f32(src_data + 0 * src_step); \ - src[1] = vld1q_f32(src_data + 1 * src_step); \ - src[2] = vld1q_f32(src_data + 2 * src_step); \ - src[3] = vld1q_f32(src_data + 3 * src_step); \ - src[4] = vld1q_f32(src_data + 4 * src_step); \ - src[5] = vld1q_f32(src_data + 5 * src_step); \ - src[6] = vld1q_f32(src_data + 6 * src_step); \ - src[7] = vld1q_f32(src_data + 7 * src_step); \ - src[8] = vld1q_f32(src_data + 8 * src_step); \ - src[9] = vld1q_f32(src_data + 9 * src_step); \ - src[10] = vld1q_f32(src_data + 10 * src_step); \ - src[11] = vld1q_f32(src_data + 11 * src_step); \ - src[12] = vld1q_f32(src_data + 12 * src_step); \ - src[13] = vld1q_f32(src_data + 13 * src_step); \ - src[14] = vld1q_f32(src_data + 14 * src_step); \ - src[15] = vld1q_f32(src_data + 15 * src_step); +#define Load16Data \ + src[0] = MS_LDQ_F32(src_data + 0 * src_step); \ + src[1] = MS_LDQ_F32(src_data + 1 * src_step); \ + src[2] = MS_LDQ_F32(src_data + 2 * src_step); \ + src[3] = MS_LDQ_F32(src_data + 3 * src_step); \ + src[4] = MS_LDQ_F32(src_data + 4 * src_step); \ + src[5] = MS_LDQ_F32(src_data + 5 * src_step); \ + src[6] = MS_LDQ_F32(src_data + 6 * src_step); \ + src[7] = MS_LDQ_F32(src_data + 7 * src_step); \ + src[8] = MS_LDQ_F32(src_data + 8 * src_step); \ + src[9] = MS_LDQ_F32(src_data + 9 * src_step); \ + 
src[10] = MS_LDQ_F32(src_data + 10 * src_step); \ + src[11] = MS_LDQ_F32(src_data + 11 * src_step); \ + src[12] = MS_LDQ_F32(src_data + 12 * src_step); \ + src[13] = MS_LDQ_F32(src_data + 13 * src_step); \ + src[14] = MS_LDQ_F32(src_data + 14 * src_step); \ + src[15] = MS_LDQ_F32(src_data + 15 * src_step); -#define Load36Data \ - src[0] = vld1q_f32(src_data + 0 * src_step); \ - src[1] = vld1q_f32(src_data + 1 * src_step); \ - src[2] = vld1q_f32(src_data + 2 * src_step); \ - src[3] = vld1q_f32(src_data + 3 * src_step); \ - src[4] = vld1q_f32(src_data + 4 * src_step); \ - src[5] = vld1q_f32(src_data + 5 * src_step); \ - src[6] = vld1q_f32(src_data + 6 * src_step); \ - src[7] = vld1q_f32(src_data + 7 * src_step); \ - src[8] = vld1q_f32(src_data + 8 * src_step); \ - src[9] = vld1q_f32(src_data + 9 * src_step); \ - src[10] = vld1q_f32(src_data + 10 * src_step); \ - src[11] = vld1q_f32(src_data + 11 * src_step); \ - src[12] = vld1q_f32(src_data + 12 * src_step); \ - src[13] = vld1q_f32(src_data + 13 * src_step); \ - src[14] = vld1q_f32(src_data + 14 * src_step); \ - src[15] = vld1q_f32(src_data + 15 * src_step); \ - src[16] = vld1q_f32(src_data + 16 * src_step); \ - src[17] = vld1q_f32(src_data + 17 * src_step); \ - src[18] = vld1q_f32(src_data + 18 * src_step); \ - src[19] = vld1q_f32(src_data + 19 * src_step); \ - src[20] = vld1q_f32(src_data + 20 * src_step); \ - src[21] = vld1q_f32(src_data + 21 * src_step); \ - src[22] = vld1q_f32(src_data + 22 * src_step); \ - src[23] = vld1q_f32(src_data + 23 * src_step); \ - src[24] = vld1q_f32(src_data + 24 * src_step); \ - src[25] = vld1q_f32(src_data + 25 * src_step); \ - src[26] = vld1q_f32(src_data + 26 * src_step); \ - src[27] = vld1q_f32(src_data + 27 * src_step); \ - src[28] = vld1q_f32(src_data + 28 * src_step); \ - src[29] = vld1q_f32(src_data + 29 * src_step); \ - src[30] = vld1q_f32(src_data + 30 * src_step); \ - src[31] = vld1q_f32(src_data + 31 * src_step); \ - src[32] = vld1q_f32(src_data + 32 * src_step); \ - src[33] = vld1q_f32(src_data + 33 * src_step); \ - src[34] = vld1q_f32(src_data + 34 * src_step); \ - src[35] = vld1q_f32(src_data + 35 * src_step); +#define Load36Data \ + src[0] = MS_LDQ_F32(src_data + 0 * src_step); \ + src[1] = MS_LDQ_F32(src_data + 1 * src_step); \ + src[2] = MS_LDQ_F32(src_data + 2 * src_step); \ + src[3] = MS_LDQ_F32(src_data + 3 * src_step); \ + src[4] = MS_LDQ_F32(src_data + 4 * src_step); \ + src[5] = MS_LDQ_F32(src_data + 5 * src_step); \ + src[6] = MS_LDQ_F32(src_data + 6 * src_step); \ + src[7] = MS_LDQ_F32(src_data + 7 * src_step); \ + src[8] = MS_LDQ_F32(src_data + 8 * src_step); \ + src[9] = MS_LDQ_F32(src_data + 9 * src_step); \ + src[10] = MS_LDQ_F32(src_data + 10 * src_step); \ + src[11] = MS_LDQ_F32(src_data + 11 * src_step); \ + src[12] = MS_LDQ_F32(src_data + 12 * src_step); \ + src[13] = MS_LDQ_F32(src_data + 13 * src_step); \ + src[14] = MS_LDQ_F32(src_data + 14 * src_step); \ + src[15] = MS_LDQ_F32(src_data + 15 * src_step); \ + src[16] = MS_LDQ_F32(src_data + 16 * src_step); \ + src[17] = MS_LDQ_F32(src_data + 17 * src_step); \ + src[18] = MS_LDQ_F32(src_data + 18 * src_step); \ + src[19] = MS_LDQ_F32(src_data + 19 * src_step); \ + src[20] = MS_LDQ_F32(src_data + 20 * src_step); \ + src[21] = MS_LDQ_F32(src_data + 21 * src_step); \ + src[22] = MS_LDQ_F32(src_data + 22 * src_step); \ + src[23] = MS_LDQ_F32(src_data + 23 * src_step); \ + src[24] = MS_LDQ_F32(src_data + 24 * src_step); \ + src[25] = MS_LDQ_F32(src_data + 25 * src_step); \ + src[26] = MS_LDQ_F32(src_data + 26 * src_step); \ + 
src[27] = MS_LDQ_F32(src_data + 27 * src_step); \ + src[28] = MS_LDQ_F32(src_data + 28 * src_step); \ + src[29] = MS_LDQ_F32(src_data + 29 * src_step); \ + src[30] = MS_LDQ_F32(src_data + 30 * src_step); \ + src[31] = MS_LDQ_F32(src_data + 31 * src_step); \ + src[32] = MS_LDQ_F32(src_data + 32 * src_step); \ + src[33] = MS_LDQ_F32(src_data + 33 * src_step); \ + src[34] = MS_LDQ_F32(src_data + 34 * src_step); \ + src[35] = MS_LDQ_F32(src_data + 35 * src_step); -#define Load64Data \ - src[0] = vld1q_f32(src_data + 0 * src_step); \ - src[1] = vld1q_f32(src_data + 1 * src_step); \ - src[2] = vld1q_f32(src_data + 2 * src_step); \ - src[3] = vld1q_f32(src_data + 3 * src_step); \ - src[4] = vld1q_f32(src_data + 4 * src_step); \ - src[5] = vld1q_f32(src_data + 5 * src_step); \ - src[6] = vld1q_f32(src_data + 6 * src_step); \ - src[7] = vld1q_f32(src_data + 7 * src_step); \ - src[8] = vld1q_f32(src_data + 8 * src_step); \ - src[9] = vld1q_f32(src_data + 9 * src_step); \ - src[10] = vld1q_f32(src_data + 10 * src_step); \ - src[11] = vld1q_f32(src_data + 11 * src_step); \ - src[12] = vld1q_f32(src_data + 12 * src_step); \ - src[13] = vld1q_f32(src_data + 13 * src_step); \ - src[14] = vld1q_f32(src_data + 14 * src_step); \ - src[15] = vld1q_f32(src_data + 15 * src_step); \ - src[16] = vld1q_f32(src_data + 16 * src_step); \ - src[17] = vld1q_f32(src_data + 17 * src_step); \ - src[18] = vld1q_f32(src_data + 18 * src_step); \ - src[19] = vld1q_f32(src_data + 19 * src_step); \ - src[20] = vld1q_f32(src_data + 20 * src_step); \ - src[21] = vld1q_f32(src_data + 21 * src_step); \ - src[22] = vld1q_f32(src_data + 22 * src_step); \ - src[23] = vld1q_f32(src_data + 23 * src_step); \ - src[24] = vld1q_f32(src_data + 24 * src_step); \ - src[25] = vld1q_f32(src_data + 25 * src_step); \ - src[26] = vld1q_f32(src_data + 26 * src_step); \ - src[27] = vld1q_f32(src_data + 27 * src_step); \ - src[28] = vld1q_f32(src_data + 28 * src_step); \ - src[29] = vld1q_f32(src_data + 29 * src_step); \ - src[30] = vld1q_f32(src_data + 30 * src_step); \ - src[31] = vld1q_f32(src_data + 31 * src_step); \ - src[32] = vld1q_f32(src_data + 32 * src_step); \ - src[33] = vld1q_f32(src_data + 33 * src_step); \ - src[34] = vld1q_f32(src_data + 34 * src_step); \ - src[35] = vld1q_f32(src_data + 35 * src_step); \ - src[36] = vld1q_f32(src_data + 36 * src_step); \ - src[37] = vld1q_f32(src_data + 37 * src_step); \ - src[38] = vld1q_f32(src_data + 38 * src_step); \ - src[39] = vld1q_f32(src_data + 39 * src_step); \ - src[40] = vld1q_f32(src_data + 40 * src_step); \ - src[41] = vld1q_f32(src_data + 41 * src_step); \ - src[42] = vld1q_f32(src_data + 42 * src_step); \ - src[43] = vld1q_f32(src_data + 43 * src_step); \ - src[44] = vld1q_f32(src_data + 44 * src_step); \ - src[45] = vld1q_f32(src_data + 45 * src_step); \ - src[46] = vld1q_f32(src_data + 46 * src_step); \ - src[47] = vld1q_f32(src_data + 47 * src_step); \ - src[48] = vld1q_f32(src_data + 48 * src_step); \ - src[49] = vld1q_f32(src_data + 49 * src_step); \ - src[50] = vld1q_f32(src_data + 50 * src_step); \ - src[51] = vld1q_f32(src_data + 51 * src_step); \ - src[52] = vld1q_f32(src_data + 52 * src_step); \ - src[53] = vld1q_f32(src_data + 53 * src_step); \ - src[54] = vld1q_f32(src_data + 54 * src_step); \ - src[55] = vld1q_f32(src_data + 55 * src_step); \ - src[56] = vld1q_f32(src_data + 56 * src_step); \ - src[57] = vld1q_f32(src_data + 57 * src_step); \ - src[58] = vld1q_f32(src_data + 58 * src_step); \ - src[59] = vld1q_f32(src_data + 59 * src_step); \ - src[60] = 
vld1q_f32(src_data + 60 * src_step); \ - src[61] = vld1q_f32(src_data + 61 * src_step); \ - src[62] = vld1q_f32(src_data + 62 * src_step); \ - src[63] = vld1q_f32(src_data + 63 * src_step); +#define Load64Data \ + src[0] = MS_LDQ_F32(src_data + 0 * src_step); \ + src[1] = MS_LDQ_F32(src_data + 1 * src_step); \ + src[2] = MS_LDQ_F32(src_data + 2 * src_step); \ + src[3] = MS_LDQ_F32(src_data + 3 * src_step); \ + src[4] = MS_LDQ_F32(src_data + 4 * src_step); \ + src[5] = MS_LDQ_F32(src_data + 5 * src_step); \ + src[6] = MS_LDQ_F32(src_data + 6 * src_step); \ + src[7] = MS_LDQ_F32(src_data + 7 * src_step); \ + src[8] = MS_LDQ_F32(src_data + 8 * src_step); \ + src[9] = MS_LDQ_F32(src_data + 9 * src_step); \ + src[10] = MS_LDQ_F32(src_data + 10 * src_step); \ + src[11] = MS_LDQ_F32(src_data + 11 * src_step); \ + src[12] = MS_LDQ_F32(src_data + 12 * src_step); \ + src[13] = MS_LDQ_F32(src_data + 13 * src_step); \ + src[14] = MS_LDQ_F32(src_data + 14 * src_step); \ + src[15] = MS_LDQ_F32(src_data + 15 * src_step); \ + src[16] = MS_LDQ_F32(src_data + 16 * src_step); \ + src[17] = MS_LDQ_F32(src_data + 17 * src_step); \ + src[18] = MS_LDQ_F32(src_data + 18 * src_step); \ + src[19] = MS_LDQ_F32(src_data + 19 * src_step); \ + src[20] = MS_LDQ_F32(src_data + 20 * src_step); \ + src[21] = MS_LDQ_F32(src_data + 21 * src_step); \ + src[22] = MS_LDQ_F32(src_data + 22 * src_step); \ + src[23] = MS_LDQ_F32(src_data + 23 * src_step); \ + src[24] = MS_LDQ_F32(src_data + 24 * src_step); \ + src[25] = MS_LDQ_F32(src_data + 25 * src_step); \ + src[26] = MS_LDQ_F32(src_data + 26 * src_step); \ + src[27] = MS_LDQ_F32(src_data + 27 * src_step); \ + src[28] = MS_LDQ_F32(src_data + 28 * src_step); \ + src[29] = MS_LDQ_F32(src_data + 29 * src_step); \ + src[30] = MS_LDQ_F32(src_data + 30 * src_step); \ + src[31] = MS_LDQ_F32(src_data + 31 * src_step); \ + src[32] = MS_LDQ_F32(src_data + 32 * src_step); \ + src[33] = MS_LDQ_F32(src_data + 33 * src_step); \ + src[34] = MS_LDQ_F32(src_data + 34 * src_step); \ + src[35] = MS_LDQ_F32(src_data + 35 * src_step); \ + src[36] = MS_LDQ_F32(src_data + 36 * src_step); \ + src[37] = MS_LDQ_F32(src_data + 37 * src_step); \ + src[38] = MS_LDQ_F32(src_data + 38 * src_step); \ + src[39] = MS_LDQ_F32(src_data + 39 * src_step); \ + src[40] = MS_LDQ_F32(src_data + 40 * src_step); \ + src[41] = MS_LDQ_F32(src_data + 41 * src_step); \ + src[42] = MS_LDQ_F32(src_data + 42 * src_step); \ + src[43] = MS_LDQ_F32(src_data + 43 * src_step); \ + src[44] = MS_LDQ_F32(src_data + 44 * src_step); \ + src[45] = MS_LDQ_F32(src_data + 45 * src_step); \ + src[46] = MS_LDQ_F32(src_data + 46 * src_step); \ + src[47] = MS_LDQ_F32(src_data + 47 * src_step); \ + src[48] = MS_LDQ_F32(src_data + 48 * src_step); \ + src[49] = MS_LDQ_F32(src_data + 49 * src_step); \ + src[50] = MS_LDQ_F32(src_data + 50 * src_step); \ + src[51] = MS_LDQ_F32(src_data + 51 * src_step); \ + src[52] = MS_LDQ_F32(src_data + 52 * src_step); \ + src[53] = MS_LDQ_F32(src_data + 53 * src_step); \ + src[54] = MS_LDQ_F32(src_data + 54 * src_step); \ + src[55] = MS_LDQ_F32(src_data + 55 * src_step); \ + src[56] = MS_LDQ_F32(src_data + 56 * src_step); \ + src[57] = MS_LDQ_F32(src_data + 57 * src_step); \ + src[58] = MS_LDQ_F32(src_data + 58 * src_step); \ + src[59] = MS_LDQ_F32(src_data + 59 * src_step); \ + src[60] = MS_LDQ_F32(src_data + 60 * src_step); \ + src[61] = MS_LDQ_F32(src_data + 61 * src_step); \ + src[62] = MS_LDQ_F32(src_data + 62 * src_step); \ + src[63] = MS_LDQ_F32(src_data + 63 * src_step); InputTransFunc GetInputTransFunc(int 
input_unit); @@ -171,67 +171,67 @@ void InputTransform8x8Unit(const float *src_data, float *dst_data, int src_step, OutputTransFunc GetOutputTransFunc(int input_unit, int output_unit, ActType act_type); -#define Store4Data \ - vst1q_f32(dst_data, m[0]); \ - vst1q_f32(dst_data + out_c, m[1]); \ - vst1q_f32(dst_data + dst_step * out_c, m[2]); \ - vst1q_f32(dst_data + dst_step * out_c + out_c, m[3]); +#define Store4Data \ + MS_STQ_F32(dst_data, m[0]); \ + MS_STQ_F32(dst_data + out_c, m[1]); \ + MS_STQ_F32(dst_data + dst_step * out_c, m[2]); \ + MS_STQ_F32(dst_data + dst_step * out_c + out_c, m[3]); -#define Store9Data \ - vst1q_f32(dst_data, m[0]); \ - vst1q_f32(dst_data + out_c, m[1]); \ - vst1q_f32(dst_data + 2 * out_c, m[2]); \ - vst1q_f32(dst_data + dst_step * out_c, m[3]); \ - vst1q_f32(dst_data + dst_step * out_c + out_c, m[4]); \ - vst1q_f32(dst_data + dst_step * out_c + 2 * out_c, m[5]); \ - vst1q_f32(dst_data + 2 * dst_step * out_c, m[6]); \ - vst1q_f32(dst_data + 2 * dst_step * out_c + out_c, m[7]); \ - vst1q_f32(dst_data + 2 * dst_step * out_c + 2 * out_c, m[8]); +#define Store9Data \ + MS_STQ_F32(dst_data, m[0]); \ + MS_STQ_F32(dst_data + out_c, m[1]); \ + MS_STQ_F32(dst_data + 2 * out_c, m[2]); \ + MS_STQ_F32(dst_data + dst_step * out_c, m[3]); \ + MS_STQ_F32(dst_data + dst_step * out_c + out_c, m[4]); \ + MS_STQ_F32(dst_data + dst_step * out_c + 2 * out_c, m[5]); \ + MS_STQ_F32(dst_data + 2 * dst_step * out_c, m[6]); \ + MS_STQ_F32(dst_data + 2 * dst_step * out_c + out_c, m[7]); \ + MS_STQ_F32(dst_data + 2 * dst_step * out_c + 2 * out_c, m[8]); -#define Store16Data \ - vst1q_f32(dst_data, m[0]); \ - vst1q_f32(dst_data + out_c, m[1]); \ - vst1q_f32(dst_data + 2 * out_c, m[2]); \ - vst1q_f32(dst_data + 3 * out_c, m[3]); \ - vst1q_f32(dst_data + dst_step * out_c, m[4]); \ - vst1q_f32(dst_data + dst_step * out_c + out_c, m[5]); \ - vst1q_f32(dst_data + dst_step * out_c + 2 * out_c, m[6]); \ - vst1q_f32(dst_data + dst_step * out_c + 3 * out_c, m[7]); \ - vst1q_f32(dst_data + 2 * dst_step * out_c, m[8]); \ - vst1q_f32(dst_data + 2 * dst_step * out_c + out_c, m[9]); \ - vst1q_f32(dst_data + 2 * dst_step * out_c + 2 * out_c, m[10]); \ - vst1q_f32(dst_data + 2 * dst_step * out_c + 3 * out_c, m[11]); \ - vst1q_f32(dst_data + 3 * dst_step * out_c, m[12]); \ - vst1q_f32(dst_data + 3 * dst_step * out_c + out_c, m[13]); \ - vst1q_f32(dst_data + 3 * dst_step * out_c + 2 * out_c, m[14]); \ - vst1q_f32(dst_data + 3 * dst_step * out_c + 3 * out_c, m[15]); +#define Store16Data \ + MS_STQ_F32(dst_data, m[0]); \ + MS_STQ_F32(dst_data + out_c, m[1]); \ + MS_STQ_F32(dst_data + 2 * out_c, m[2]); \ + MS_STQ_F32(dst_data + 3 * out_c, m[3]); \ + MS_STQ_F32(dst_data + dst_step * out_c, m[4]); \ + MS_STQ_F32(dst_data + dst_step * out_c + out_c, m[5]); \ + MS_STQ_F32(dst_data + dst_step * out_c + 2 * out_c, m[6]); \ + MS_STQ_F32(dst_data + dst_step * out_c + 3 * out_c, m[7]); \ + MS_STQ_F32(dst_data + 2 * dst_step * out_c, m[8]); \ + MS_STQ_F32(dst_data + 2 * dst_step * out_c + out_c, m[9]); \ + MS_STQ_F32(dst_data + 2 * dst_step * out_c + 2 * out_c, m[10]); \ + MS_STQ_F32(dst_data + 2 * dst_step * out_c + 3 * out_c, m[11]); \ + MS_STQ_F32(dst_data + 3 * dst_step * out_c, m[12]); \ + MS_STQ_F32(dst_data + 3 * dst_step * out_c + out_c, m[13]); \ + MS_STQ_F32(dst_data + 3 * dst_step * out_c + 2 * out_c, m[14]); \ + MS_STQ_F32(dst_data + 3 * dst_step * out_c + 3 * out_c, m[15]); -#define Store25Data \ - vst1q_f32(dst_data, m[0]); \ - vst1q_f32(dst_data + out_c, m[1]); \ - vst1q_f32(dst_data + 2 * out_c, m[2]); 
\ - vst1q_f32(dst_data + 3 * out_c, m[3]); \ - vst1q_f32(dst_data + 4 * out_c, m[4]); \ - vst1q_f32(dst_data + dst_step * out_c, m[5]); \ - vst1q_f32(dst_data + dst_step * out_c + out_c, m[6]); \ - vst1q_f32(dst_data + dst_step * out_c + 2 * out_c, m[7]); \ - vst1q_f32(dst_data + dst_step * out_c + 3 * out_c, m[8]); \ - vst1q_f32(dst_data + dst_step * out_c + 4 * out_c, m[9]); \ - vst1q_f32(dst_data + 2 * dst_step * out_c, m[10]); \ - vst1q_f32(dst_data + 2 * dst_step * out_c + out_c, m[11]); \ - vst1q_f32(dst_data + 2 * dst_step * out_c + 2 * out_c, m[12]); \ - vst1q_f32(dst_data + 2 * dst_step * out_c + 3 * out_c, m[13]); \ - vst1q_f32(dst_data + 2 * dst_step * out_c + 4 * out_c, m[14]); \ - vst1q_f32(dst_data + 3 * dst_step * out_c, m[15]); \ - vst1q_f32(dst_data + 3 * dst_step * out_c + out_c, m[16]); \ - vst1q_f32(dst_data + 3 * dst_step * out_c + 2 * out_c, m[17]); \ - vst1q_f32(dst_data + 3 * dst_step * out_c + 3 * out_c, m[18]); \ - vst1q_f32(dst_data + 3 * dst_step * out_c + 4 * out_c, m[19]); \ - vst1q_f32(dst_data + 4 * dst_step * out_c, m[20]); \ - vst1q_f32(dst_data + 4 * dst_step * out_c + out_c, m[21]); \ - vst1q_f32(dst_data + 4 * dst_step * out_c + 2 * out_c, m[22]); \ - vst1q_f32(dst_data + 4 * dst_step * out_c + 3 * out_c, m[23]); \ - vst1q_f32(dst_data + 4 * dst_step * out_c + 4 * out_c, m[24]); +#define Store25Data \ + MS_STQ_F32(dst_data, m[0]); \ + MS_STQ_F32(dst_data + out_c, m[1]); \ + MS_STQ_F32(dst_data + 2 * out_c, m[2]); \ + MS_STQ_F32(dst_data + 3 * out_c, m[3]); \ + MS_STQ_F32(dst_data + 4 * out_c, m[4]); \ + MS_STQ_F32(dst_data + dst_step * out_c, m[5]); \ + MS_STQ_F32(dst_data + dst_step * out_c + out_c, m[6]); \ + MS_STQ_F32(dst_data + dst_step * out_c + 2 * out_c, m[7]); \ + MS_STQ_F32(dst_data + dst_step * out_c + 3 * out_c, m[8]); \ + MS_STQ_F32(dst_data + dst_step * out_c + 4 * out_c, m[9]); \ + MS_STQ_F32(dst_data + 2 * dst_step * out_c, m[10]); \ + MS_STQ_F32(dst_data + 2 * dst_step * out_c + out_c, m[11]); \ + MS_STQ_F32(dst_data + 2 * dst_step * out_c + 2 * out_c, m[12]); \ + MS_STQ_F32(dst_data + 2 * dst_step * out_c + 3 * out_c, m[13]); \ + MS_STQ_F32(dst_data + 2 * dst_step * out_c + 4 * out_c, m[14]); \ + MS_STQ_F32(dst_data + 3 * dst_step * out_c, m[15]); \ + MS_STQ_F32(dst_data + 3 * dst_step * out_c + out_c, m[16]); \ + MS_STQ_F32(dst_data + 3 * dst_step * out_c + 2 * out_c, m[17]); \ + MS_STQ_F32(dst_data + 3 * dst_step * out_c + 3 * out_c, m[18]); \ + MS_STQ_F32(dst_data + 3 * dst_step * out_c + 4 * out_c, m[19]); \ + MS_STQ_F32(dst_data + 4 * dst_step * out_c, m[20]); \ + MS_STQ_F32(dst_data + 4 * dst_step * out_c + out_c, m[21]); \ + MS_STQ_F32(dst_data + 4 * dst_step * out_c + 2 * out_c, m[22]); \ + MS_STQ_F32(dst_data + 4 * dst_step * out_c + 3 * out_c, m[23]); \ + MS_STQ_F32(dst_data + 4 * dst_step * out_c + 4 * out_c, m[24]); void OutputTransform4x2Unit(const float *src_data, float *dst_data, const float *bias_data, int src_step, int dst_step, int out_c, int r_w, int r_h, int r_c);
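For reference, the MS_* names this patch substitutes for the raw NEON intrinsics (MS_FLOAT32X4 for float32x4_t, MS_LDQ_F32 for vld1q_f32, MS_STQ_F32 for vst1q_f32, MS_MOVQ_F32 for vdupq_n_f32, and so on) form a small portability layer so the same Winograd transform bodies compile for both ARM NEON and x86 SSE. The sketch below is a plausible shape for that layer, assuming only the standard NEON and SSE intrinsics; it is an illustration, not the shipped nnacl header (the real definitions are pulled in via nnacl/op_base.h and the nnacl intrinsics headers, and may differ in detail).

/* Hypothetical sketch of the MS_* wrappers; not the actual nnacl header. */
#if defined(ENABLE_ARM)
#include <arm_neon.h>
typedef float32x4_t MS_FLOAT32X4;
#define MS_LDQ_F32(ptr)    vld1q_f32(ptr)        /* load 4 floats          */
#define MS_STQ_F32(ptr, v) vst1q_f32(ptr, v)     /* store 4 floats         */
#define MS_MOVQ_F32(x)     vdupq_n_f32(x)        /* broadcast one scalar   */
#define MS_ADDQ_F32(a, b)  vaddq_f32(a, b)
#define MS_SUBQ_F32(a, b)  vsubq_f32(a, b)
#define MS_MULQ_F32(a, s)  vmulq_n_f32(a, s)     /* vector * scalar        */
#define MS_MAXQ_F32(a, b)  vmaxq_f32(a, b)
#define MS_MINQ_F32(a, b)  vminq_f32(a, b)
#elif defined(ENABLE_SSE)
#include <xmmintrin.h>
typedef __m128 MS_FLOAT32X4;
#define MS_LDQ_F32(ptr)    _mm_loadu_ps(ptr)     /* unaligned 4-float load */
#define MS_STQ_F32(ptr, v) _mm_storeu_ps(ptr, v)
#define MS_MOVQ_F32(x)     _mm_set1_ps(x)
#define MS_ADDQ_F32(a, b)  _mm_add_ps(a, b)
#define MS_SUBQ_F32(a, b)  _mm_sub_ps(a, b)
#define MS_MULQ_F32(a, s)  _mm_mul_ps(a, _mm_set1_ps(s))  /* scalar operand broadcast first */
#define MS_MAXQ_F32(a, b)  _mm_max_ps(a, b)
#define MS_MINQ_F32(a, b)  _mm_min_ps(a, b)
#endif

With wrappers of this shape, a Relu6 clamp such as m[l] = MS_MINQ_F32(six, MS_MAXQ_F32(zero, m[l])) lowers to vminq_f32/vmaxq_f32 on ARM and to _mm_min_ps/_mm_max_ps on x86, which is why the transform bodies above need no per-ISA branches: only the #if defined(ENABLE_ARM) || defined(ENABLE_SSE) guards and the shared Load*/Store* macros vary by target.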