diff --git a/mindspore/lite/nnacl/fp32/common_func_fp32.c b/mindspore/lite/nnacl/fp32/common_func_fp32.c index 2ec26b5e98..01914b2a5c 100644 --- a/mindspore/lite/nnacl/fp32/common_func_fp32.c +++ b/mindspore/lite/nnacl/fp32/common_func_fp32.c @@ -56,7 +56,7 @@ void PostConvFuncFp32C8(const float *c8_out_ptr, float *out_ptr, const float *bi void PostConvFuncFp32C4(const float *c4_out_ptr, float *out_ptr, const float *bias_ptr, size_t output_channel, size_t plane_size, size_t plane_stride, size_t relu_type) { -#ifdef ENABLE_ARM +#if defined(ENABLE_ARM) || defined(ENABLE_SSE) size_t oc4mod = output_channel % C4NUM; size_t oc4div = output_channel - oc4mod; size_t stride_size = (plane_stride - plane_size) * C4NUM * sizeof(float); diff --git a/mindspore/lite/nnacl/fp32/common_func_fp32.h b/mindspore/lite/nnacl/fp32/common_func_fp32.h index a6b7c09cb7..898af91d64 100644 --- a/mindspore/lite/nnacl/fp32/common_func_fp32.h +++ b/mindspore/lite/nnacl/fp32/common_func_fp32.h @@ -50,10 +50,6 @@ void DeconvDwFp32Center(float *dst, const float *src, const float *weight, size_ size_t in_kh_step, size_t in_kw_step); void PostFuncBiasReluC8(float *dst, const float *src, const float *bias, size_t oc8div, size_t oc8mod, size_t plane_size, size_t stride, size_t relu_type); -#endif - -#ifdef ENABLE_ARM - void ConvDwFp32Row(float *output_ptr, const float *input_ptr, const float *weight_ptr, size_t num_pixels, size_t output_channel, size_t input_step); void PostFuncBiasReluC4(float *dst, const float *src, const float *bias, size_t oc4div, size_t oc4mod, diff --git a/mindspore/lite/nnacl/fp32/conv_depthwise_fp32.c b/mindspore/lite/nnacl/fp32/conv_depthwise_fp32.c index 372cefa8e7..83467834bc 100644 --- a/mindspore/lite/nnacl/fp32/conv_depthwise_fp32.c +++ b/mindspore/lite/nnacl/fp32/conv_depthwise_fp32.c @@ -21,7 +21,7 @@ #include #endif -#ifndef ENABLE_ARM +#if !defined(ENABLE_ARM) && !defined(ENABLE_SSE) void ConvDwFp32Row(float *output_ptr, const float *input_ptr, const float 
*weight_ptr, int num_pixels, int output_channel, int input_step) { for (int i = 0; i < num_pixels; i++) { diff --git a/mindspore/lite/nnacl/fp32/deconv_winograd_fp32.c b/mindspore/lite/nnacl/fp32/deconv_winograd_fp32.c index 1594d09c09..bc305962f9 100644 --- a/mindspore/lite/nnacl/fp32/deconv_winograd_fp32.c +++ b/mindspore/lite/nnacl/fp32/deconv_winograd_fp32.c @@ -161,7 +161,7 @@ void DeConvWgInputPack(const float *src_ptr, float *dst_ptr, int channel, int st return; } -#ifndef ENABLE_ARM +#if !defined(ENABLE_ARM) && !defined(ENABLE_SSE) void TiledC4MatmulFp32(float *dst, const float *src, const float *weight, size_t cal_num, size_t ic4, size_t oc4) { int dx, sz, dz; const int src_depth_step = 4 * DECONV_WINOGRAD_DEFAULT_TILE; diff --git a/mindspore/lite/nnacl/x86_64_sse/ConvDwFp32Row_sse.c b/mindspore/lite/nnacl/x86_64_sse/ConvDwFp32Row_sse.c new file mode 100644 index 0000000000..ccf1a72395 --- /dev/null +++ b/mindspore/lite/nnacl/x86_64_sse/ConvDwFp32Row_sse.c @@ -0,0 +1,86 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifdef ENABLE_SSE  // everything up to the matching #endif is compiled only when ENABLE_SSE is defined
+#include  // NOTE(review): the header name after #include is missing in this text (presumably the SSE intrinsics header) - confirm against the real commit
+#include "nnacl/fp32/common_func_fp32.h"
+
+void ConvDwFp32Row(float *output_ptr, const float *input_ptr, const float *weight_ptr, size_t num_pixels,  // SSE row kernel: for each pixel, output[c] += weight[c] * input[c] over all channels c
+                   size_t output_channel, size_t input_step) {  // output_ptr moves output_channel floats per pixel, input_ptr moves input_step floats per pixel
+  size_t out_c16 = DOWN_DIV(output_channel, C16NUM) * C16NUM;  // end of the 16-channel loop; DOWN_DIV and C16NUM are declared outside this patch (presumably floor division and 16)
+  size_t out_c8 = DOWN_DIV(output_channel, C8NUM) * C8NUM;  // end of the 8-channel loop
+  size_t out_c4 = DOWN_DIV(output_channel, C4NUM) * C4NUM;  // end of the 4-channel loop
+  for (int i = 0; i < num_pixels; i++) {  // NOTE(review): int index compared with size_t num_pixels (signed/unsigned mix)
+    const float *weight_tmp = weight_ptr;  // weights restart at channel 0 for every pixel
+    const float *input_tmp = input_ptr;
+    size_t out_c = 0;  // channel cursor shared by the four loops below, so each loop resumes where the previous one stopped
+    for (; out_c < out_c16; out_c += C16NUM) {  // 16 channels per iteration, handled as four unaligned 4-float vectors
+      __m128 dst1 = _mm_loadu_ps(output_ptr);  // current output values are loaded because the result is accumulated into them
+      __m128 dst2 = _mm_loadu_ps(output_ptr + 4);
+      __m128 dst3 = _mm_loadu_ps(output_ptr + 8);
+      __m128 dst4 = _mm_loadu_ps(output_ptr + 12);
+      __m128 w1 = _mm_loadu_ps(weight_tmp);
+      __m128 w2 = _mm_loadu_ps(weight_tmp + 4);
+      __m128 w3 = _mm_loadu_ps(weight_tmp + 8);
+      __m128 w4 = _mm_loadu_ps(weight_tmp + 12);
+      __m128 in1 = _mm_loadu_ps(input_tmp);
+      __m128 in2 = _mm_loadu_ps(input_tmp + 4);
+      __m128 in3 = _mm_loadu_ps(input_tmp + 8);
+      __m128 in4 = _mm_loadu_ps(input_tmp + 12);
+      dst1 = MS_MLAQ_F32(dst1, w1, in1);  // MS_MLAQ_F32 is defined outside this patch; presumably dst + w * in, matching the scalar tail below - confirm
+      dst2 = MS_MLAQ_F32(dst2, w2, in2);
+      dst3 = MS_MLAQ_F32(dst3, w3, in3);
+      dst4 = MS_MLAQ_F32(dst4, w4, in4);
+      _mm_storeu_ps(output_ptr, dst1);
+      _mm_storeu_ps(output_ptr + 4, dst2);
+      _mm_storeu_ps(output_ptr + 8, dst3);
+      _mm_storeu_ps(output_ptr + 12, dst4);
+      output_ptr += 16;  // output_ptr itself advances and is never rewound, so it ends each pixel output_channel floats further on
+      input_tmp += 16;
+      weight_tmp += 16;
+    }
+    for (; out_c < out_c8; out_c += C8NUM) {  // same accumulation, 8 channels per iteration
+      __m128 dst1 = _mm_loadu_ps(output_ptr);
+      __m128 dst2 = _mm_loadu_ps(output_ptr + 4);
+      __m128 w1 = _mm_loadu_ps(weight_tmp);
+      __m128 w2 = _mm_loadu_ps(weight_tmp + 4);
+      __m128 in1 = _mm_loadu_ps(input_tmp);
+      __m128 in2 = _mm_loadu_ps(input_tmp + 4);
+      dst1 = MS_MLAQ_F32(dst1, w1, in1);
+      dst2 = MS_MLAQ_F32(dst2, w2, in2);
+      _mm_storeu_ps(output_ptr, dst1);
+      _mm_storeu_ps(output_ptr + 4, dst2);
+      output_ptr += 8;
+      input_tmp += 8;
+      weight_tmp += 8;
+    }
+    for (; out_c < out_c4; out_c += C4NUM) {  // same accumulation, 4 channels per iteration
+      __m128 dst1 = _mm_loadu_ps(output_ptr);
+      __m128 w1 = _mm_loadu_ps(weight_tmp);
+      __m128 in1 = _mm_loadu_ps(input_tmp);
+      dst1 = MS_MLAQ_F32(dst1, w1, in1);
+      _mm_storeu_ps(output_ptr, dst1);
+      output_ptr += 4;
+      input_tmp += 4;
+      weight_tmp += 4;
+    }
+    for (; out_c < output_channel; out_c++) {  // scalar remainder for the channels the vector loops did not cover
+      *output_ptr++ += weight_ptr[out_c] * input_ptr[out_c];  // indexed from the per-pixel base pointers, not from the *_tmp cursors
+    }
+    input_ptr += input_step;  // move to the next input pixel
+  }
+}
+#endif
diff --git a/mindspore/lite/nnacl/x86_64_sse/PostFuncBiasReluC4.c b/mindspore/lite/nnacl/x86_64_sse/PostFuncBiasReluC4.c
new file mode 100644
index 0000000000..382a2d2cb4
--- /dev/null
+++ b/mindspore/lite/nnacl/x86_64_sse/PostFuncBiasReluC4.c
@@ -0,0 +1,126 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef ENABLE_SSE  // everything up to the matching #endif is compiled only when ENABLE_SSE is defined
+#include  // NOTE(review): the header name after #include is missing in this text (presumably the SSE intrinsics header) - confirm against the real commit
+#include "nnacl/fp32/common_func_fp32.h"
+
+void PostFuncBiasReluC4(float *dst, const float *src, const float *bias, size_t oc4div, size_t oc4mod,  // adds bias, applies the activation chosen by relu_type, and scatters 4-channel groups from src into dst
+                        size_t plane_size, size_t plane_stride, size_t relu_type) {  // relu_type 3: min with 6 then max with 0; relu_type 1: max with 0; any other value: no clamp
+  __m128 relu6 = _mm_set_ps1(6.0);  // upper clamp used when relu_type == 3
+  __m128 zero = _mm_setzero_ps();  // lower clamp used when relu_type is 1 or 3
+  size_t stride = oc4div + oc4mod;  // distance in floats between consecutive plane elements in dst
+  plane_stride /= sizeof(float);  // converted to a float count before being added to src; presumably callers pass it in bytes - confirm
+  for (size_t loop_c4 = 0; loop_c4 < oc4div; loop_c4 += C4NUM) {  // one full group of C4NUM output channels per iteration (C4NUM is declared outside this patch)
+    size_t plane_size_tmp = plane_size;
+    float *dst_c4 = dst + loop_c4;  // first destination slot for this channel group
+    __m128 bias1 = _mm_setzero_ps();  // a NULL bias behaves like an all-zero bias
+    if (bias != NULL) {
+      bias1 = _mm_loadu_ps(bias);
+      bias += 4;
+    }
+    for (; plane_size_tmp >= C4NUM; plane_size_tmp -= C4NUM) {  // four plane elements (16 contiguous src floats) per iteration
+      __m128 src1 = _mm_loadu_ps(src);
+      __m128 src2 = _mm_loadu_ps(src + 4);
+      __m128 src3 = _mm_loadu_ps(src + 8);
+      __m128 src4 = _mm_loadu_ps(src + 12);
+      src += 16;
+      src1 = _mm_add_ps(src1, bias1);
+      src2 = _mm_add_ps(src2, bias1);
+      src3 = _mm_add_ps(src3, bias1);
+      src4 = _mm_add_ps(src4, bias1);
+      switch (relu_type) {
+        case 3:  // no break after this case: the min with 6 is followed by the max with 0 below
+          src1 = _mm_min_ps(src1, relu6);
+          src2 = _mm_min_ps(src2, relu6);
+          src3 = _mm_min_ps(src3, relu6);
+          src4 = _mm_min_ps(src4, relu6);
+        case 1:
+          src1 = _mm_max_ps(src1, zero);
+          src2 = _mm_max_ps(src2, zero);
+          src3 = _mm_max_ps(src3, zero);
+          src4 = _mm_max_ps(src4, zero);
+          break;
+      }
+      _mm_storeu_ps(dst_c4, src1);  // each plane element's four channels are written stride floats apart
+      dst_c4 += stride;
+      _mm_storeu_ps(dst_c4, src2);
+      dst_c4 += stride;
+      _mm_storeu_ps(dst_c4, src3);
+      dst_c4 += stride;
+      _mm_storeu_ps(dst_c4, src4);
+      dst_c4 += stride;
+    }
+    for (; plane_size_tmp > 0; plane_size_tmp -= 1) {  // remaining plane elements, one at a time
+      __m128 src1 = _mm_loadu_ps(src);
+      src1 = _mm_add_ps(src1, bias1);
+      switch (relu_type) {
+        case 3:  // falls through to case 1, as in the loop above
+          src1 = _mm_min_ps(src1, relu6);
+        case 1:
+          src1 = _mm_max_ps(src1, zero);
+          break;
+      }
+      _mm_storeu_ps(dst_c4, src1);
+      dst_c4 += stride;
+      src += 4;
+    }
+    src += plane_stride;  // skip the gap (in floats) before the next channel group in src
+  }
+
+  if (oc4mod == 0) {  // no partial channel group left to write
+    return;
+  }
+  __m128 bias1 = _mm_setzero_ps();
+  if (bias != NULL) {
+    bias1 = _mm_loadu_ps(bias);  // NOTE(review): reads 4 bias floats although only oc4mod channels remain; assumes bias is padded - confirm
+    bias += 4;
+  }
+  float *dst_c1 = dst + oc4div;  // the partial group starts right after the full groups in dst
+  for (size_t plane_size_tmp = plane_size; plane_size_tmp > 0; plane_size_tmp -= 1) {  // one plane element per iteration
+    __m128 src1 = _mm_loadu_ps(src);
+    src += 4;
+    src1 = _mm_add_ps(src1, bias1);
+    switch (relu_type) {
+      case 3:  // falls through to case 1, as in the loops above
+        src1 = _mm_min_ps(src1, relu6);
+      case 1:
+        src1 = _mm_max_ps(src1, zero);
+        break;
+    }
+    switch (oc4mod) {  // store only the first oc4mod lanes of the vector
+      case 1:
+        _mm_store_ss(dst_c1, src1);  // lane 0 only
+        dst_c1 += stride;
+        break;
+      case 2:
+        _mm_storel_pi((__m64 *)(dst_c1), src1);  // lanes 0 and 1
+        dst_c1 += stride;
+        break;
+      case 3:
+        _mm_storel_pi((__m64 *)(dst_c1), src1);  // lanes 0 and 1
+        src1 = _mm_unpackhi_ps(src1, src1);  // brings lane 2 down to lane 0
+        _mm_store_ss(dst_c1 + 2, src1);  // writes the former lane 2 as the third channel
+        dst_c1 += stride;
+        break;
+      case 4:  // NOTE(review): looks unreachable if oc4mod is always a remainder modulo C4NUM - confirm against the caller
+        _mm_storeu_ps(dst_c1, src1);
+        dst_c1 += stride;
+        break;
+    }
+  }
+}
+#endif
diff --git a/mindspore/lite/nnacl/x86_64_sse/PosFuncBiasRelu.c b/mindspore/lite/nnacl/x86_64_sse/PostFuncBiasReluC8.c
similarity index 100%
rename from mindspore/lite/nnacl/x86_64_sse/PosFuncBiasRelu.c
rename to mindspore/lite/nnacl/x86_64_sse/PostFuncBiasReluC8.c
diff --git a/mindspore/lite/nnacl/x86_64_sse/TiledC4MatMulFp32.c b/mindspore/lite/nnacl/x86_64_sse/TiledC4MatMulFp32.c
new file mode 100644
index 0000000000..2db1768ce9
--- /dev/null
+++ b/mindspore/lite/nnacl/x86_64_sse/TiledC4MatMulFp32.c
@@ -0,0 +1,175 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifdef ENABLE_SSE  // everything up to the matching #endif is compiled only when ENABLE_SSE is defined
+#include  // NOTE(review): the header name after #include is missing in this text (presumably the SSE intrinsics header) - confirm against the real commit
+#include "nnacl/fp32/common_func_fp32.h"
+void TiledC4MatmulFp32(float *dst, const float *src, const float *weight, size_t cal_num, size_t ic4, size_t oc4) {  // per output block: 8 tiles of 4 floats, each the sum over ic4 input blocks of src lane j times weight row j
+  const float *src_tmp = src;  // remembered so src can be rewound for every output block
+  for (int i = 0; i < oc4; ++i) {  // one output block per iteration; NOTE(review): int index compared with size_t oc4
+    float *dst_tmp = dst;
+    src = src_tmp;
+    size_t ic4_tmp = ic4 - 1;  // input blocks left after the first; NOTE(review): wraps around when ic4 == 0 - confirm callers never pass 0
+    __m128 src1 = _mm_loadu_ps(src);  // tiles 0..3 of the first input block
+    __m128 src2 = _mm_loadu_ps(src + 4);
+    __m128 src3 = _mm_loadu_ps(src + 8);
+    __m128 src4 = _mm_loadu_ps(src + 12);
+    src += 16;
+    __m128 weight_data[4];  // the four 4-float weight rows of the current input block
+    weight_data[0] = _mm_loadu_ps(weight);
+    weight_data[1] = _mm_loadu_ps(weight + 4);
+    weight_data[2] = _mm_loadu_ps(weight + 8);
+    weight_data[3] = _mm_loadu_ps(weight + 12);
+    weight += 16;  // weight is never rewound: each output block consumes the next 16 floats per input block
+    __m128 dst1 = _mm_mul_ps(weight_data[0], _mm_set_ps1(src1[0]));  // src1[0] subscripts the vector directly; NOTE(review): relies on compiler vector-subscript support (GCC/Clang) - confirm for other toolchains
+    __m128 dst2 = _mm_mul_ps(weight_data[0], _mm_set_ps1(src2[0]));
+    __m128 dst3 = _mm_mul_ps(weight_data[0], _mm_set_ps1(src3[0]));
+    __m128 dst4 = _mm_mul_ps(weight_data[0], _mm_set_ps1(src4[0]));
+    for (int j = 1; j < 4; ++j) {  // add the contributions of lanes 1..3 for tiles 0..3
+      dst1 = _mm_add_ps(dst1, _mm_mul_ps(weight_data[j], _mm_set_ps1(src1[j])));
+      dst2 = _mm_add_ps(dst2, _mm_mul_ps(weight_data[j], _mm_set_ps1(src2[j])));
+      dst3 = _mm_add_ps(dst3, _mm_mul_ps(weight_data[j], _mm_set_ps1(src3[j])));
+      dst4 = _mm_add_ps(dst4, _mm_mul_ps(weight_data[j], _mm_set_ps1(src4[j])));
+    }
+    src1 = _mm_loadu_ps(src);  // tiles 4..7 of the first input block, multiplied by the same four weight rows
+    src2 = _mm_loadu_ps(src + 4);
+    src3 = _mm_loadu_ps(src + 8);
+    src4 = _mm_loadu_ps(src + 12);
+    src += 16;
+    __m128 dst5 = _mm_mul_ps(weight_data[0], _mm_set_ps1(src1[0]));
+    __m128 dst6 = _mm_mul_ps(weight_data[0], _mm_set_ps1(src2[0]));
+    __m128 dst7 = _mm_mul_ps(weight_data[0], _mm_set_ps1(src3[0]));
+    __m128 dst8 = _mm_mul_ps(weight_data[0], _mm_set_ps1(src4[0]));
+    for (int j = 1; j < 4; ++j) {
+      dst5 = _mm_add_ps(dst5, _mm_mul_ps(weight_data[j], _mm_set_ps1(src1[j])));
+      dst6 = _mm_add_ps(dst6, _mm_mul_ps(weight_data[j], _mm_set_ps1(src2[j])));
+      dst7 = _mm_add_ps(dst7, _mm_mul_ps(weight_data[j], _mm_set_ps1(src3[j])));
+      dst8 = _mm_add_ps(dst8, _mm_mul_ps(weight_data[j], _mm_set_ps1(src4[j])));
+    }
+    if (ic4_tmp != 0) {  // taken when more than one input block exists; loads for the next step are interleaved with arithmetic of the current one
+      ic4_tmp -= 1;  // the last input block is finished after the loop, so the loop runs for the blocks in between
+      src1 = _mm_loadu_ps(src);  // tiles 0..3 of the next input block
+      src2 = _mm_loadu_ps(src + 4);
+      src3 = _mm_loadu_ps(src + 8);
+      src4 = _mm_loadu_ps(src + 12);
+      src += 16;
+      weight_data[0] = _mm_loadu_ps(weight);  // only rows 0 and 1 are loaded here; rows 2 and 3 are loaded later, while rows 0 and 1 are in use
+      weight_data[1] = _mm_loadu_ps(weight + 4);
+      weight += 8;
+
+      dst1 = _mm_add_ps(dst1, _mm_mul_ps(weight_data[0], _mm_set_ps1(src1[0])));  // row-0 term for tiles 0 and 1 is issued before the loop; the loop ends by issuing it for the following block
+      dst2 = _mm_add_ps(dst2, _mm_mul_ps(weight_data[0], _mm_set_ps1(src2[0])));
+      for (; ic4_tmp != 0; ic4_tmp -= 1) {  // each iteration completes one input block for all 8 tiles
+        dst3 = _mm_add_ps(dst3, _mm_mul_ps(weight_data[0], _mm_set_ps1(src3[0])));
+        dst4 = _mm_add_ps(dst4, _mm_mul_ps(weight_data[0], _mm_set_ps1(src4[0])));
+
+        dst1 = _mm_add_ps(dst1, _mm_mul_ps(weight_data[1], _mm_set_ps1(src1[1])));
+        dst2 = _mm_add_ps(dst2, _mm_mul_ps(weight_data[1], _mm_set_ps1(src2[1])));
+        weight_data[2] = _mm_loadu_ps(weight);  // rows 2 and 3 of the current input block
+        weight_data[3] = _mm_loadu_ps(weight + 4);
+        weight += 8;
+        dst3 = _mm_add_ps(dst3, _mm_mul_ps(weight_data[1], _mm_set_ps1(src3[1])));
+        dst4 = _mm_add_ps(dst4, _mm_mul_ps(weight_data[1], _mm_set_ps1(src4[1])));
+
+        dst1 = _mm_add_ps(dst1, _mm_mul_ps(weight_data[2], _mm_set_ps1(src1[2])));
+        dst2 = _mm_add_ps(dst2, _mm_mul_ps(weight_data[2], _mm_set_ps1(src2[2])));
+        dst3 = _mm_add_ps(dst3, _mm_mul_ps(weight_data[2], _mm_set_ps1(src3[2])));
+        dst4 = _mm_add_ps(dst4, _mm_mul_ps(weight_data[2], _mm_set_ps1(src4[2])));
+
+        dst1 = _mm_add_ps(dst1, _mm_mul_ps(weight_data[3], _mm_set_ps1(src1[3])));
+        dst2 = _mm_add_ps(dst2, _mm_mul_ps(weight_data[3], _mm_set_ps1(src2[3])));
+        src1 = _mm_loadu_ps(src);  // src1 and src2 are reloaded (tiles 4 and 5) as soon as their last use for tiles 0 and 1 is done
+        src2 = _mm_loadu_ps(src + 4);
+        dst3 = _mm_add_ps(dst3, _mm_mul_ps(weight_data[3], _mm_set_ps1(src3[3])));
+        dst4 = _mm_add_ps(dst4, _mm_mul_ps(weight_data[3], _mm_set_ps1(src4[3])));
+        src3 = _mm_loadu_ps(src + 8);  // tiles 6 and 7
+        src4 = _mm_loadu_ps(src + 12);
+        src += 16;
+
+        dst5 = _mm_add_ps(dst5, _mm_mul_ps(weight_data[0], _mm_set_ps1(src1[0])));
+        dst6 = _mm_add_ps(dst6, _mm_mul_ps(weight_data[0], _mm_set_ps1(src2[0])));
+        dst7 = _mm_add_ps(dst7, _mm_mul_ps(weight_data[0], _mm_set_ps1(src3[0])));
+        dst8 = _mm_add_ps(dst8, _mm_mul_ps(weight_data[0], _mm_set_ps1(src4[0])));
+
+        dst5 = _mm_add_ps(dst5, _mm_mul_ps(weight_data[1], _mm_set_ps1(src1[1])));
+        dst6 = _mm_add_ps(dst6, _mm_mul_ps(weight_data[1], _mm_set_ps1(src2[1])));
+        dst7 = _mm_add_ps(dst7, _mm_mul_ps(weight_data[1], _mm_set_ps1(src3[1])));
+        dst8 = _mm_add_ps(dst8, _mm_mul_ps(weight_data[1], _mm_set_ps1(src4[1])));
+
+        dst5 = _mm_add_ps(dst5, _mm_mul_ps(weight_data[2], _mm_set_ps1(src1[2])));
+        dst6 = _mm_add_ps(dst6, _mm_mul_ps(weight_data[2], _mm_set_ps1(src2[2])));
+        dst7 = _mm_add_ps(dst7, _mm_mul_ps(weight_data[2], _mm_set_ps1(src3[2])));
+        weight_data[0] = _mm_loadu_ps(weight);  // rows 0 and 1 of the following input block; the current rows 0 and 1 are no longer needed
+        weight_data[1] = _mm_loadu_ps(weight + 4);
+        weight += 8;
+        dst8 = _mm_add_ps(dst8, _mm_mul_ps(weight_data[2], _mm_set_ps1(src4[2])));
+
+        dst5 = _mm_add_ps(dst5, _mm_mul_ps(weight_data[3], _mm_set_ps1(src1[3])));
+        dst6 = _mm_add_ps(dst6, _mm_mul_ps(weight_data[3], _mm_set_ps1(src2[3])));
+        dst7 = _mm_add_ps(dst7, _mm_mul_ps(weight_data[3], _mm_set_ps1(src3[3])));
+        src1 = _mm_loadu_ps(src);  // tiles 0 and 1 of the following input block
+        src2 = _mm_loadu_ps(src + 4);
+        dst8 = _mm_add_ps(dst8, _mm_mul_ps(weight_data[3], _mm_set_ps1(src4[3])));
+        src3 = _mm_loadu_ps(src + 8);  // tiles 2 and 3 of the following input block
+        src4 = _mm_loadu_ps(src + 12);
+        src += 16;
+
+        dst1 = _mm_add_ps(dst1, _mm_mul_ps(weight_data[0], _mm_set_ps1(src1[0])));  // row-0 term of the following block, mirroring the two statements before the loop
+        dst2 = _mm_add_ps(dst2, _mm_mul_ps(weight_data[0], _mm_set_ps1(src2[0])));
+      }
+      dst3 = _mm_add_ps(dst3, _mm_mul_ps(weight_data[0], _mm_set_ps1(src3[0])));  // from here on: finish the last input block, with nothing further to preload beyond its own data
+      dst4 = _mm_add_ps(dst4, _mm_mul_ps(weight_data[0], _mm_set_ps1(src4[0])));
+
+      dst1 = _mm_add_ps(dst1, _mm_mul_ps(weight_data[1], _mm_set_ps1(src1[1])));
+      dst2 = _mm_add_ps(dst2, _mm_mul_ps(weight_data[1], _mm_set_ps1(src2[1])));
+      weight_data[2] = _mm_loadu_ps(weight);  // rows 2 and 3 of the last input block
+      weight_data[3] = _mm_loadu_ps(weight + 4);
+      weight += 8;
+      dst3 = _mm_add_ps(dst3, _mm_mul_ps(weight_data[1], _mm_set_ps1(src3[1])));
+      dst4 = _mm_add_ps(dst4, _mm_mul_ps(weight_data[1], _mm_set_ps1(src4[1])));
+
+      dst1 = _mm_add_ps(dst1, _mm_mul_ps(weight_data[2], _mm_set_ps1(src1[2])));
+      dst2 = _mm_add_ps(dst2, _mm_mul_ps(weight_data[2], _mm_set_ps1(src2[2])));
+      dst3 = _mm_add_ps(dst3, _mm_mul_ps(weight_data[2], _mm_set_ps1(src3[2])));
+      dst4 = _mm_add_ps(dst4, _mm_mul_ps(weight_data[2], _mm_set_ps1(src4[2])));
+
+      dst1 = _mm_add_ps(dst1, _mm_mul_ps(weight_data[3], _mm_set_ps1(src1[3])));
+      dst2 = _mm_add_ps(dst2, _mm_mul_ps(weight_data[3], _mm_set_ps1(src2[3])));
+      dst3 = _mm_add_ps(dst3, _mm_mul_ps(weight_data[3], _mm_set_ps1(src3[3])));
+      src1 = _mm_loadu_ps(src);  // tiles 4..7 of the last input block
+      src2 = _mm_loadu_ps(src + 4);
+      dst4 = _mm_add_ps(dst4, _mm_mul_ps(weight_data[3], _mm_set_ps1(src4[3])));
+      src3 = _mm_loadu_ps(src + 8);
+      src4 = _mm_loadu_ps(src + 12);
+      src += 16;
+      for (int j = 0; j < 4; ++j) {  // all four weight rows are available now, so tiles 4..7 are finished in a plain loop
+        dst5 = _mm_add_ps(dst5, _mm_mul_ps(weight_data[j], _mm_set_ps1(src1[j])));
+        dst6 = _mm_add_ps(dst6, _mm_mul_ps(weight_data[j], _mm_set_ps1(src2[j])));
+        dst7 = _mm_add_ps(dst7, _mm_mul_ps(weight_data[j], _mm_set_ps1(src3[j])));
+        dst8 = _mm_add_ps(dst8, _mm_mul_ps(weight_data[j], _mm_set_ps1(src4[j])));
+      }
+    }
+    _mm_storeu_ps(dst, dst1);  // the 8 result tiles are written as 32 contiguous floats
+    _mm_storeu_ps(dst + 4, dst2);
+    _mm_storeu_ps(dst + 8, dst3);
+    _mm_storeu_ps(dst + 12, dst4);
+    _mm_storeu_ps(dst + 16, dst5);
+    _mm_storeu_ps(dst + 20, dst6);
+    _mm_storeu_ps(dst + 24, dst7);
+    _mm_storeu_ps(dst + 28, dst8);
+    dst = dst_tmp + cal_num;  // the next output block starts cal_num floats after this one
+  }
+}
+#endif
diff --git a/mindspore/lite/src/ops/populate/tensorlistfromtensor_populate.cc b/mindspore/lite/src/ops/populate/tensorlistfromtensor_populate.cc
index dbc4a605e7..3c7f157d30 100644
--- a/mindspore/lite/src/ops/populate/tensorlistfromtensor_populate.cc
+++ b/mindspore/lite/src/ops/populate/tensorlistfromtensor_populate.cc
@@ -17,7 +17,7 @@
 #include "nnacl/tensorlist_parameter.h"
 #include "src/ops/primitive_c.h"
 #include "src/ops/populate/populate_register.h"
-#include "src/ops/tensorlistfromtensor.h"
+#include
"src/ops/tensorlist_fromtensor.h" namespace mindspore { namespace lite { diff --git a/mindspore/lite/src/ops/populate/tensorlistgetitem_populate.cc b/mindspore/lite/src/ops/populate/tensorlistgetitem_populate.cc index 4ca542724f..18c8b3508a 100644 --- a/mindspore/lite/src/ops/populate/tensorlistgetitem_populate.cc +++ b/mindspore/lite/src/ops/populate/tensorlistgetitem_populate.cc @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "src/ops/tensorlistgetitem.h" +#include "src/ops/tensorlist_getitem.h" #include "src/ops/primitive_c.h" #include "src/ops/populate/populate_register.h" #include "nnacl/tensorlist_parameter.h" diff --git a/mindspore/lite/src/ops/populate/tensorlistreserve_populate.cc b/mindspore/lite/src/ops/populate/tensorlistreserve_populate.cc index 8504f40d3f..76a007cd02 100644 --- a/mindspore/lite/src/ops/populate/tensorlistreserve_populate.cc +++ b/mindspore/lite/src/ops/populate/tensorlistreserve_populate.cc @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "src/ops/tensorlistreserve.h" +#include "src/ops/tensorlist_reserve.h" #include "src/ops/primitive_c.h" #include "src/ops/populate/populate_register.h" #include "nnacl/tensorlist_parameter.h" diff --git a/mindspore/lite/src/ops/populate/tensorlistsetlitem_populate.cc b/mindspore/lite/src/ops/populate/tensorlistsetlitem_populate.cc index 163d0d9065..ab95a57d32 100644 --- a/mindspore/lite/src/ops/populate/tensorlistsetlitem_populate.cc +++ b/mindspore/lite/src/ops/populate/tensorlistsetlitem_populate.cc @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -#include "src/ops/tensorlistsetitem.h" +#include "src/ops/tensorlist_setitem.h" #include "src/ops/primitive_c.h" #include "src/ops/populate/populate_register.h" #include "nnacl/tensorlist_parameter.h" diff --git a/mindspore/lite/src/ops/populate/tensorliststack_populate.cc b/mindspore/lite/src/ops/populate/tensorliststack_populate.cc index 88bab2207a..a06638ca24 100644 --- a/mindspore/lite/src/ops/populate/tensorliststack_populate.cc +++ b/mindspore/lite/src/ops/populate/tensorliststack_populate.cc @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "src/ops/tensorliststack.h" +#include "src/ops/tensorlist_stack.h" #include "src/ops/primitive_c.h" #include "src/ops/populate/populate_register.h" #include "nnacl/tensorlist_parameter.h" diff --git a/mindspore/lite/src/ops/primitive_c.cc b/mindspore/lite/src/ops/primitive_c.cc index 822ad1d5ab..3b3de99e8c 100644 --- a/mindspore/lite/src/ops/primitive_c.cc +++ b/mindspore/lite/src/ops/primitive_c.cc @@ -150,11 +150,11 @@ #include "src/ops/unsorted_segment_sum.h" #include "src/ops/reciprocal.h" #include "src/ops/constant.h" -#include "src/ops/tensorlistfromtensor.h" -#include "src/ops/tensorlistgetitem.h" -#include "src/ops/tensorlistsetitem.h" -#include "src/ops/tensorlistreserve.h" -#include "src/ops/tensorliststack.h" +#include "src/ops/tensorlist_fromtensor.h" +#include "src/ops/tensorlist_getitem.h" +#include "src/ops/tensorlist_setitem.h" +#include "src/ops/tensorlist_reserve.h" +#include "src/ops/tensorlist_stack.h" #include "src/ops/merge.h" #include "src/ops/switch.h" #include "src/ops/partial.h" diff --git a/mindspore/lite/src/ops/tensorlistfromtensor.cc b/mindspore/lite/src/ops/tensorlist_fromtensor.cc similarity index 98% rename from mindspore/lite/src/ops/tensorlistfromtensor.cc rename to mindspore/lite/src/ops/tensorlist_fromtensor.cc index 490b975cbd..8a389ce3dd 100644 --- a/mindspore/lite/src/ops/tensorlistfromtensor.cc +++ b/mindspore/lite/src/ops/tensorlist_fromtensor.cc @@ -14,7 +14,7 
@@ * limitations under the License. */ #include -#include "src/ops/tensorlistfromtensor.h" +#include "src/ops/tensorlist_fromtensor.h" #ifndef PRIMITIVE_WRITEABLE #include "src/ops/ops_register.h" @@ -133,7 +133,6 @@ int TensorListFromTensor::InferShape(std::vector inputs_, std::v auto ele_shape_ptr = reinterpret_cast(input1->data_c()); auto output = reinterpret_cast(outputs_[0]); MS_ASSERT(output != nullptr); - // output->set_tensors_data_type(input0->data_type()); std::vector > tensor_shape(dim0, std::vector(input0_shape.begin() + 1, input0_shape.end())); output->set_element_shape(std::vector(ele_shape_ptr, ele_shape_ptr + input1->ElementsNum())); output->set_shape(std::vector(1, dim0)); diff --git a/mindspore/lite/src/ops/tensorlistfromtensor.h b/mindspore/lite/src/ops/tensorlist_fromtensor.h similarity index 100% rename from mindspore/lite/src/ops/tensorlistfromtensor.h rename to mindspore/lite/src/ops/tensorlist_fromtensor.h diff --git a/mindspore/lite/src/ops/tensorlistgetitem.cc b/mindspore/lite/src/ops/tensorlist_getitem.cc similarity index 99% rename from mindspore/lite/src/ops/tensorlistgetitem.cc rename to mindspore/lite/src/ops/tensorlist_getitem.cc index 1f68c49975..065c3e8e90 100644 --- a/mindspore/lite/src/ops/tensorlistgetitem.cc +++ b/mindspore/lite/src/ops/tensorlist_getitem.cc @@ -14,7 +14,7 @@ * limitations under the License. 
*/ #include -#include "src/ops/tensorlistgetitem.h" +#include "src/ops/tensorlist_getitem.h" #ifndef PRIMITIVE_WRITEABLE #include "src/ops/ops_register.h" diff --git a/mindspore/lite/src/ops/tensorlistgetitem.h b/mindspore/lite/src/ops/tensorlist_getitem.h similarity index 100% rename from mindspore/lite/src/ops/tensorlistgetitem.h rename to mindspore/lite/src/ops/tensorlist_getitem.h diff --git a/mindspore/lite/src/ops/tensorlistreserve.cc b/mindspore/lite/src/ops/tensorlist_reserve.cc similarity index 99% rename from mindspore/lite/src/ops/tensorlistreserve.cc rename to mindspore/lite/src/ops/tensorlist_reserve.cc index 058ff0d2b4..fe7c0e66a7 100644 --- a/mindspore/lite/src/ops/tensorlistreserve.cc +++ b/mindspore/lite/src/ops/tensorlist_reserve.cc @@ -14,7 +14,7 @@ * limitations under the License. */ #include -#include "src/ops/tensorlistreserve.h" +#include "src/ops/tensorlist_reserve.h" #ifndef PRIMITIVE_WRITEABLE #include "src/ops/ops_register.h" diff --git a/mindspore/lite/src/ops/tensorlistreserve.h b/mindspore/lite/src/ops/tensorlist_reserve.h similarity index 100% rename from mindspore/lite/src/ops/tensorlistreserve.h rename to mindspore/lite/src/ops/tensorlist_reserve.h diff --git a/mindspore/lite/src/ops/tensorlistsetitem.cc b/mindspore/lite/src/ops/tensorlist_setitem.cc similarity index 99% rename from mindspore/lite/src/ops/tensorlistsetitem.cc rename to mindspore/lite/src/ops/tensorlist_setitem.cc index 5626a877e2..34969c44ed 100644 --- a/mindspore/lite/src/ops/tensorlistsetitem.cc +++ b/mindspore/lite/src/ops/tensorlist_setitem.cc @@ -14,7 +14,7 @@ * limitations under the License. 
*/ #include -#include "src/ops/tensorlistsetitem.h" +#include "src/ops/tensorlist_setitem.h" #ifndef PRIMITIVE_WRITEABLE #include "src/ops/ops_register.h" diff --git a/mindspore/lite/src/ops/tensorlistsetitem.h b/mindspore/lite/src/ops/tensorlist_setitem.h similarity index 100% rename from mindspore/lite/src/ops/tensorlistsetitem.h rename to mindspore/lite/src/ops/tensorlist_setitem.h diff --git a/mindspore/lite/src/ops/tensorliststack.cc b/mindspore/lite/src/ops/tensorlist_stack.cc similarity index 99% rename from mindspore/lite/src/ops/tensorliststack.cc rename to mindspore/lite/src/ops/tensorlist_stack.cc index 00d564f886..3c162e0c2a 100644 --- a/mindspore/lite/src/ops/tensorliststack.cc +++ b/mindspore/lite/src/ops/tensorlist_stack.cc @@ -14,7 +14,7 @@ * limitations under the License. */ #include -#include "src/ops/tensorliststack.h" +#include "src/ops/tensorlist_stack.h" #ifndef PRIMITIVE_WRITEABLE #include "src/ops/ops_register.h" diff --git a/mindspore/lite/src/ops/tensorliststack.h b/mindspore/lite/src/ops/tensorlist_stack.h similarity index 100% rename from mindspore/lite/src/ops/tensorliststack.h rename to mindspore/lite/src/ops/tensorlist_stack.h diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/TensorListFromTensor.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/tensorlist_fromtensor_fp32.cc similarity index 98% rename from mindspore/lite/src/runtime/kernel/arm/fp32/TensorListFromTensor.cc rename to mindspore/lite/src/runtime/kernel/arm/fp32/tensorlist_fromtensor_fp32.cc index 5351e69b4f..a2ffd8f78d 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/TensorListFromTensor.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/tensorlist_fromtensor_fp32.cc @@ -15,7 +15,7 @@ */ #include "include/errorcode.h" #include "src/kernel_registry.h" -#include "src/runtime/kernel/arm/fp32/TensorListFromTensor.h" +#include "src/runtime/kernel/arm/fp32/tensorlist_fromtensor_fp32.h" #include "src/runtime/runtime_api.h" using mindspore::kernel::KERNEL_ARCH::kCPU; 
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/TensorListFromTensor.h b/mindspore/lite/src/runtime/kernel/arm/fp32/tensorlist_fromtensor_fp32.h similarity index 100% rename from mindspore/lite/src/runtime/kernel/arm/fp32/TensorListFromTensor.h rename to mindspore/lite/src/runtime/kernel/arm/fp32/tensorlist_fromtensor_fp32.h diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/TensorListGetItem.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/tensorlist_getitem_fp32.cc similarity index 98% rename from mindspore/lite/src/runtime/kernel/arm/fp32/TensorListGetItem.cc rename to mindspore/lite/src/runtime/kernel/arm/fp32/tensorlist_getitem_fp32.cc index 82e1225bda..affa8f7545 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/TensorListGetItem.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/tensorlist_getitem_fp32.cc @@ -16,7 +16,7 @@ #include "include/errorcode.h" #include "include/ms_tensor.h" #include "src/kernel_registry.h" -#include "src/runtime/kernel/arm/fp32/TensorListGetItem.h" +#include "src/runtime/kernel/arm/fp32/tensorlist_getitem_fp32.h" #include "src/runtime/runtime_api.h" using mindspore::kernel::KERNEL_ARCH::kCPU; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/TensorListGetItem.h b/mindspore/lite/src/runtime/kernel/arm/fp32/tensorlist_getitem_fp32.h similarity index 100% rename from mindspore/lite/src/runtime/kernel/arm/fp32/TensorListGetItem.h rename to mindspore/lite/src/runtime/kernel/arm/fp32/tensorlist_getitem_fp32.h diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/TensorListReserve.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/tensorlist_reserve_fp32.cc similarity index 97% rename from mindspore/lite/src/runtime/kernel/arm/fp32/TensorListReserve.cc rename to mindspore/lite/src/runtime/kernel/arm/fp32/tensorlist_reserve_fp32.cc index 2c958b7650..eb5caeb1ed 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/TensorListReserve.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/tensorlist_reserve_fp32.cc @@ 
-16,7 +16,7 @@ #include #include "include/errorcode.h" #include "src/kernel_registry.h" -#include "src/runtime/kernel/arm/fp32/TensorListReserve.h" +#include "src/runtime/kernel/arm/fp32/tensorlist_reserve_fp32.h" using mindspore::kernel::KERNEL_ARCH::kCPU; using mindspore::lite::KernelRegistrar; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/TensorListReserve.h b/mindspore/lite/src/runtime/kernel/arm/fp32/tensorlist_reserve_fp32.h similarity index 100% rename from mindspore/lite/src/runtime/kernel/arm/fp32/TensorListReserve.h rename to mindspore/lite/src/runtime/kernel/arm/fp32/tensorlist_reserve_fp32.h diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/TensorListSetItem.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/tensorlist_setitem_fp32.cc similarity index 98% rename from mindspore/lite/src/runtime/kernel/arm/fp32/TensorListSetItem.cc rename to mindspore/lite/src/runtime/kernel/arm/fp32/tensorlist_setitem_fp32.cc index 63a0cfadd1..fcaa460f3e 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/TensorListSetItem.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/tensorlist_setitem_fp32.cc @@ -16,7 +16,7 @@ #include "include/errorcode.h" #include "include/ms_tensor.h" #include "src/kernel_registry.h" -#include "src/runtime/kernel/arm/fp32/TensorListSetItem.h" +#include "src/runtime/kernel/arm/fp32/tensorlist_setitem_fp32.h" #include "src/runtime/runtime_api.h" using mindspore::kernel::KERNEL_ARCH::kCPU; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/TensorListSetItem.h b/mindspore/lite/src/runtime/kernel/arm/fp32/tensorlist_setitem_fp32.h similarity index 100% rename from mindspore/lite/src/runtime/kernel/arm/fp32/TensorListSetItem.h rename to mindspore/lite/src/runtime/kernel/arm/fp32/tensorlist_setitem_fp32.h diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/TensorListStack.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/tensorlist_stack_fp32.cc similarity index 99% rename from 
mindspore/lite/src/runtime/kernel/arm/fp32/TensorListStack.cc rename to mindspore/lite/src/runtime/kernel/arm/fp32/tensorlist_stack_fp32.cc index 91ad3f9956..ec04883bc9 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/TensorListStack.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/tensorlist_stack_fp32.cc @@ -19,7 +19,7 @@ #include "include/errorcode.h" #include "ir/dtype/type_id.h" #include "src/kernel_registry.h" -#include "src/runtime/kernel/arm/fp32/TensorListStack.h" +#include "src/runtime/kernel/arm/fp32/tensorlist_stack_fp32.h" using mindspore::kernel::KERNEL_ARCH::kCPU; using mindspore::lite::KernelRegistrar; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/TensorListStack.h b/mindspore/lite/src/runtime/kernel/arm/fp32/tensorlist_stack_fp32.h similarity index 100% rename from mindspore/lite/src/runtime/kernel/arm/fp32/TensorListStack.h rename to mindspore/lite/src/runtime/kernel/arm/fp32/tensorlist_stack_fp32.h