From: @lzkcode Reviewed-by: @hangangqiang, @zhang_xue_tong Signed-off-by: @zhang_xue_tong tags/v1.1.0
| @@ -68,7 +68,7 @@ void PostConvFuncFp32C4(const float *c4_out_ptr, float *out_ptr, const float *bi | |||
| return; | |||
| } | |||
| #ifndef ENABLE_ARM | |||
| #if !defined(ENABLE_ARM) && !defined(ENABLE_X86_64_SSE) | |||
| void WinogradTransLeft(const float *S, const float *B, float *M, size_t w, size_t h, size_t k, size_t length) { | |||
| const int unitStep = 4 * length; | |||
| for (int y = 0; y < h; ++y) { | |||
| @@ -39,6 +39,13 @@ float ShortToFloat32(uint16_t src_value); | |||
| uint16_t Float32ToShort(float src_value); | |||
| #ifdef ENABLE_X86_64_SSE | |||
| void PostFuncBiasReluC8(float *dst, const float *src, const float *bias, size_t oc8div, size_t oc8mod, | |||
| size_t plane_size, size_t stride, size_t relu_type); | |||
| void PostFuncBiasReluC4(float *dst, const float *src, const float *bias, size_t oc4div, size_t oc4mod, | |||
| size_t plane_size, size_t plane_stride, size_t relu_type); | |||
| #endif | |||
| #ifdef ENABLE_ARM | |||
| void ConvDwFp32Center(float *dst, const float *src, const float *weight, const float *bias, size_t height, size_t width, | |||
| size_t kernel_h, size_t kernel_w, size_t out_h_step, size_t block_channel, size_t in_sh_step, | |||
| @@ -55,10 +55,10 @@ void MatrixMultiplyWinograd(const float *matix_a, const float *matrix_b, float * | |||
| src1_j += in_channel; | |||
| src2_y += n; | |||
| } | |||
| _mm_store_ps(matrix_c, dst1); | |||
| _mm_store_ps(matrix_c + 4, dst2); | |||
| _mm_store_ps(matrix_c + 8, dst3); | |||
| _mm_store_ps(matrix_c + 12, dst4); | |||
| _mm_storeu_ps(matrix_c, dst1); | |||
| _mm_storeu_ps(matrix_c + 4, dst2); | |||
| _mm_storeu_ps(matrix_c + 8, dst3); | |||
| _mm_storeu_ps(matrix_c + 12, dst4); | |||
| src1_j -= in_channel * k; | |||
| src1_j += C16NUM; | |||
| matrix_c += C16NUM; | |||
| @@ -80,8 +80,8 @@ void MatrixMultiplyWinograd(const float *matix_a, const float *matrix_b, float * | |||
| src1_j += in_channel; | |||
| src2_y += n; | |||
| } | |||
| _mm_store_ps(matrix_c, dst1); | |||
| _mm_store_ps(matrix_c + 4, dst2); | |||
| _mm_storeu_ps(matrix_c, dst1); | |||
| _mm_storeu_ps(matrix_c + 4, dst2); | |||
| src1_j -= in_channel * k; | |||
| src1_j += C8NUM; | |||
| matrix_c += C8NUM; | |||
| @@ -185,26 +185,26 @@ void MatmulFloatSse64Opt(const float *a, const float *b, float *c, const float * | |||
| } | |||
| if (write_mode == 2) { // WriteWino | |||
| c = dst + WinoSteps2; | |||
| _mm_store_ps(dst, dst1); | |||
| _mm_store_ps(dst + 4, dst2); | |||
| _mm_storeu_ps(dst, dst1); | |||
| _mm_storeu_ps(dst + 4, dst2); | |||
| dst += WinoSteps1; | |||
| _mm_store_ps(dst, dst3); | |||
| _mm_store_ps(dst + 4, dst4); | |||
| _mm_storeu_ps(dst, dst3); | |||
| _mm_storeu_ps(dst + 4, dst4); | |||
| dst += WinoSteps1; | |||
| _mm_store_ps(dst, dst5); | |||
| _mm_store_ps(dst + 4, dst6); | |||
| _mm_storeu_ps(dst, dst5); | |||
| _mm_storeu_ps(dst + 4, dst6); | |||
| dst += WinoSteps1; | |||
| _mm_store_ps(dst, dst7); | |||
| _mm_store_ps(dst + 4, dst8); | |||
| _mm_storeu_ps(dst, dst7); | |||
| _mm_storeu_ps(dst + 4, dst8); | |||
| } else if (write_mode == 0) { // WriteC8 | |||
| _mm_store_ps(c, dst1); | |||
| _mm_store_ps(c + 4, dst2); | |||
| _mm_store_ps(c + 8, dst3); | |||
| _mm_store_ps(c + 12, dst4); | |||
| _mm_store_ps(c + 16, dst5); | |||
| _mm_store_ps(c + 20, dst6); | |||
| _mm_store_ps(c + 24, dst7); | |||
| _mm_store_ps(c + 28, dst8); | |||
| _mm_storeu_ps(c, dst1); | |||
| _mm_storeu_ps(c + 4, dst2); | |||
| _mm_storeu_ps(c + 8, dst3); | |||
| _mm_storeu_ps(c + 12, dst4); | |||
| _mm_storeu_ps(c + 16, dst5); | |||
| _mm_storeu_ps(c + 20, dst6); | |||
| _mm_storeu_ps(c + 24, dst7); | |||
| _mm_storeu_ps(c + 28, dst8); | |||
| c += C8Steps; | |||
| } else { | |||
| switch (cc) { | |||
| @@ -288,39 +288,39 @@ void MatmulFloatSse64Opt(const float *a, const float *b, float *c, const float * | |||
| break; | |||
| case 4: // write4 | |||
| c = dst + 4; | |||
| _mm_store_ps(dst, dst1); | |||
| _mm_storeu_ps(dst, dst1); | |||
| if (r > 1) { | |||
| dst += stride; | |||
| _mm_store_ps(dst, dst3); | |||
| _mm_storeu_ps(dst, dst3); | |||
| } | |||
| if (r > 2) { | |||
| dst += stride; | |||
| _mm_store_ps(dst, dst5); | |||
| _mm_storeu_ps(dst, dst5); | |||
| } | |||
| if (r > 3) { | |||
| dst += stride; | |||
| _mm_store_ps(dst, dst7); | |||
| _mm_storeu_ps(dst, dst7); | |||
| dst += stride; | |||
| dst += 4; | |||
| } | |||
| break; | |||
| case 5: // write5 | |||
| c = dst + 5; | |||
| _mm_store_ps(dst, dst1); | |||
| _mm_storeu_ps(dst, dst1); | |||
| _mm_store_ss(dst + 4, dst2); | |||
| if (r > 1) { | |||
| dst += stride; | |||
| _mm_store_ps(dst, dst3); | |||
| _mm_storeu_ps(dst, dst3); | |||
| _mm_store_ss(dst + 4, dst4); | |||
| } | |||
| if (r > 2) { | |||
| dst += stride; | |||
| _mm_store_ps(dst, dst5); | |||
| _mm_storeu_ps(dst, dst5); | |||
| _mm_store_ss(dst + 4, dst6); | |||
| } | |||
| if (r > 3) { | |||
| dst += stride; | |||
| _mm_store_ps(dst, dst7); | |||
| _mm_storeu_ps(dst, dst7); | |||
| _mm_store_ss(dst + 4, dst8); | |||
| dst += stride; | |||
| dst += 5; | |||
| @@ -328,27 +328,27 @@ void MatmulFloatSse64Opt(const float *a, const float *b, float *c, const float * | |||
| break; | |||
| case 6: // write6 | |||
| c = dst + 6; | |||
| _mm_store_ps(dst, dst1); | |||
| _mm_storeu_ps(dst, dst1); | |||
| _mm_store_ss(dst + 4, dst2); | |||
| dst2 = _mm_shuffle_ps(dst2, dst2, _MM_SHUFFLE(0, 3, 2, 1)); | |||
| _mm_store_ss(dst + 5, dst2); | |||
| if (r > 1) { | |||
| dst += stride; | |||
| _mm_store_ps(dst, dst3); | |||
| _mm_storeu_ps(dst, dst3); | |||
| _mm_store_ss(dst + 4, dst4); | |||
| dst4 = _mm_shuffle_ps(dst4, dst4, _MM_SHUFFLE(0, 3, 2, 1)); | |||
| _mm_store_ss(dst + 5, dst4); | |||
| } | |||
| if (r > 2) { | |||
| dst += stride; | |||
| _mm_store_ps(dst, dst5); | |||
| _mm_storeu_ps(dst, dst5); | |||
| _mm_store_ss(dst + 4, dst6); | |||
| dst6 = _mm_shuffle_ps(dst6, dst6, _MM_SHUFFLE(0, 3, 2, 1)); | |||
| _mm_store_ss(dst + 5, dst6); | |||
| } | |||
| if (r > 3) { | |||
| dst += stride; | |||
| _mm_store_ps(dst, dst7); | |||
| _mm_storeu_ps(dst, dst7); | |||
| _mm_store_ss(dst + 4, dst8); | |||
| dst8 = _mm_shuffle_ps(dst8, dst8, _MM_SHUFFLE(0, 3, 2, 1)); | |||
| _mm_store_ss(dst + 5, dst8); | |||
| @@ -358,7 +358,7 @@ void MatmulFloatSse64Opt(const float *a, const float *b, float *c, const float * | |||
| break; | |||
| case 7: // write7 | |||
| c = dst + 7; | |||
| _mm_store_ps(dst, dst1); | |||
| _mm_storeu_ps(dst, dst1); | |||
| _mm_store_ss(dst + 4, dst2); | |||
| dst2 = _mm_shuffle_ps(dst2, dst2, _MM_SHUFFLE(0, 3, 2, 1)); | |||
| _mm_store_ss(dst + 5, dst2); | |||
| @@ -366,7 +366,7 @@ void MatmulFloatSse64Opt(const float *a, const float *b, float *c, const float * | |||
| _mm_store_ss(dst + 6, dst2); | |||
| if (r > 1) { | |||
| dst += stride; | |||
| _mm_store_ps(dst, dst3); | |||
| _mm_storeu_ps(dst, dst3); | |||
| _mm_store_ss(dst + 4, dst4); | |||
| dst4 = _mm_shuffle_ps(dst4, dst4, _MM_SHUFFLE(0, 3, 2, 1)); | |||
| _mm_store_ss(dst + 5, dst4); | |||
| @@ -375,7 +375,7 @@ void MatmulFloatSse64Opt(const float *a, const float *b, float *c, const float * | |||
| } | |||
| if (r > 2) { | |||
| dst += stride; | |||
| _mm_store_ps(dst, dst5); | |||
| _mm_storeu_ps(dst, dst5); | |||
| _mm_store_ss(dst + 4, dst6); | |||
| dst6 = _mm_shuffle_ps(dst6, dst6, _MM_SHUFFLE(0, 3, 2, 1)); | |||
| _mm_store_ss(dst + 5, dst6); | |||
| @@ -384,7 +384,7 @@ void MatmulFloatSse64Opt(const float *a, const float *b, float *c, const float * | |||
| } | |||
| if (r > 3) { | |||
| dst += stride; | |||
| _mm_store_ps(dst, dst7); | |||
| _mm_storeu_ps(dst, dst7); | |||
| _mm_store_ss(dst + 4, dst8); | |||
| dst8 = _mm_shuffle_ps(dst8, dst8, _MM_SHUFFLE(0, 3, 2, 1)); | |||
| _mm_store_ss(dst + 5, dst8); | |||
| @@ -396,22 +396,22 @@ void MatmulFloatSse64Opt(const float *a, const float *b, float *c, const float * | |||
| break; | |||
| default: // write8 | |||
| c = dst + C8NUM; | |||
| _mm_store_ps(dst, dst1); | |||
| _mm_store_ps(dst + 4, dst2); | |||
| _mm_storeu_ps(dst, dst1); | |||
| _mm_storeu_ps(dst + 4, dst2); | |||
| if (r > 1) { | |||
| dst += stride; | |||
| _mm_store_ps(dst, dst3); | |||
| _mm_store_ps(dst + 4, dst4); | |||
| _mm_storeu_ps(dst, dst3); | |||
| _mm_storeu_ps(dst + 4, dst4); | |||
| } | |||
| if (r > 2) { | |||
| dst += stride; | |||
| _mm_store_ps(dst, dst5); | |||
| _mm_store_ps(dst + 4, dst6); | |||
| _mm_storeu_ps(dst, dst5); | |||
| _mm_storeu_ps(dst + 4, dst6); | |||
| } | |||
| if (r > 3) { | |||
| dst += stride; | |||
| _mm_store_ps(dst, dst7); | |||
| _mm_store_ps(dst + 4, dst8); | |||
| _mm_storeu_ps(dst, dst7); | |||
| _mm_storeu_ps(dst + 4, dst8); | |||
| dst += stride; | |||
| dst += C8NUM; | |||
| } | |||
| @@ -518,27 +518,27 @@ void MatmulFloatSse64(const float *a, const float *b, float *c, const float *bia | |||
| dst8 = _mm_max_ps(dst8, zero); | |||
| } | |||
| if (WriteWino != 0) { // WriteWino | |||
| _mm_store_ps(dst, dst1); | |||
| _mm_store_ps(dst + 4, dst2); | |||
| _mm_storeu_ps(dst, dst1); | |||
| _mm_storeu_ps(dst + 4, dst2); | |||
| dst += WriteWinoSteps; | |||
| _mm_store_ps(dst, dst3); | |||
| _mm_store_ps(dst + 4, dst4); | |||
| _mm_storeu_ps(dst, dst3); | |||
| _mm_storeu_ps(dst + 4, dst4); | |||
| dst += WriteWinoSteps; | |||
| _mm_store_ps(dst, dst5); | |||
| _mm_store_ps(dst + 4, dst6); | |||
| _mm_storeu_ps(dst, dst5); | |||
| _mm_storeu_ps(dst + 4, dst6); | |||
| dst += WriteWinoSteps; | |||
| _mm_store_ps(dst, dst7); | |||
| _mm_store_ps(dst + 4, dst8); | |||
| _mm_storeu_ps(dst, dst7); | |||
| _mm_storeu_ps(dst + 4, dst8); | |||
| dst += WriteWinoSteps; | |||
| } else if (writeNhwc == 0) { // WriteC8 | |||
| _mm_store_ps(dst, dst1); | |||
| _mm_store_ps(dst + 4, dst2); | |||
| _mm_store_ps(dst + 8, dst3); | |||
| _mm_store_ps(dst + 12, dst4); | |||
| _mm_store_ps(dst + 16, dst5); | |||
| _mm_store_ps(dst + 20, dst6); | |||
| _mm_store_ps(dst + 24, dst7); | |||
| _mm_store_ps(dst + 28, dst8); | |||
| _mm_storeu_ps(dst, dst1); | |||
| _mm_storeu_ps(dst + 4, dst2); | |||
| _mm_storeu_ps(dst + 8, dst3); | |||
| _mm_storeu_ps(dst + 12, dst4); | |||
| _mm_storeu_ps(dst + 16, dst5); | |||
| _mm_storeu_ps(dst + 20, dst6); | |||
| _mm_storeu_ps(dst + 24, dst7); | |||
| _mm_storeu_ps(dst + 28, dst8); | |||
| dst += 32; | |||
| c = dst; | |||
| } else { | |||
| @@ -612,68 +612,68 @@ void MatmulFloatSse64(const float *a, const float *b, float *c, const float *bia | |||
| dst += stride; | |||
| } | |||
| case 4: // write4 | |||
| _mm_store_ps(dst, dst1); | |||
| _mm_storeu_ps(dst, dst1); | |||
| if (r > 1) { | |||
| dst += stride; | |||
| _mm_store_ps(dst, dst3); | |||
| _mm_storeu_ps(dst, dst3); | |||
| } | |||
| if (r > 2) { | |||
| dst += stride; | |||
| _mm_store_ps(dst, dst5); | |||
| _mm_storeu_ps(dst, dst5); | |||
| } | |||
| if (r > 3) { | |||
| dst += stride; | |||
| _mm_store_ps(dst, dst7); | |||
| _mm_storeu_ps(dst, dst7); | |||
| dst += stride; | |||
| } | |||
| case 5: // // write5 | |||
| _mm_store_ps(dst, dst1); | |||
| _mm_storeu_ps(dst, dst1); | |||
| _mm_store_ss(dst + 4, dst2); | |||
| if (r > 1) { | |||
| dst += stride; | |||
| _mm_store_ps(dst, dst3); | |||
| _mm_storeu_ps(dst, dst3); | |||
| _mm_store_ss(dst + 4, dst4); | |||
| } | |||
| if (r > 2) { | |||
| dst += stride; | |||
| _mm_store_ps(dst, dst5); | |||
| _mm_storeu_ps(dst, dst5); | |||
| _mm_store_ss(dst + 4, dst6); | |||
| } | |||
| if (r > 3) { | |||
| dst += stride; | |||
| _mm_store_ps(dst, dst7); | |||
| _mm_storeu_ps(dst, dst7); | |||
| _mm_store_ss(dst + 4, dst8); | |||
| dst += stride; | |||
| } | |||
| case 6: // write6 | |||
| _mm_store_ps(dst, dst1); | |||
| _mm_storeu_ps(dst, dst1); | |||
| _mm_store_ss(dst + 4, dst2); | |||
| dst2 = _mm_shuffle_ps(dst2, dst2, _MM_SHUFFLE(0, 3, 2, 1)); | |||
| _mm_store_ss(dst + 5, dst2); | |||
| if (r > 1) { | |||
| dst += stride; | |||
| _mm_store_ps(dst, dst3); | |||
| _mm_storeu_ps(dst, dst3); | |||
| _mm_store_ss(dst + 4, dst4); | |||
| dst4 = _mm_shuffle_ps(dst4, dst4, _MM_SHUFFLE(0, 3, 2, 1)); | |||
| _mm_store_ss(dst + 5, dst4); | |||
| } | |||
| if (r > 2) { | |||
| dst += stride; | |||
| _mm_store_ps(dst, dst5); | |||
| _mm_storeu_ps(dst, dst5); | |||
| _mm_store_ss(dst + 4, dst6); | |||
| dst6 = _mm_shuffle_ps(dst6, dst6, _MM_SHUFFLE(0, 3, 2, 1)); | |||
| _mm_store_ss(dst + 5, dst6); | |||
| } | |||
| if (r > 3) { | |||
| dst += stride; | |||
| _mm_store_ps(dst, dst7); | |||
| _mm_storeu_ps(dst, dst7); | |||
| _mm_store_ss(dst + 4, dst8); | |||
| dst8 = _mm_shuffle_ps(dst8, dst8, _MM_SHUFFLE(0, 3, 2, 1)); | |||
| _mm_store_ss(dst + 5, dst8); | |||
| dst += stride; | |||
| } | |||
| case 7: // write7 | |||
| _mm_store_ps(dst, dst1); | |||
| _mm_storeu_ps(dst, dst1); | |||
| _mm_store_ss(dst + 4, dst2); | |||
| dst2 = _mm_shuffle_ps(dst2, dst2, _MM_SHUFFLE(0, 3, 2, 1)); | |||
| _mm_store_ss(dst + 5, dst2); | |||
| @@ -681,7 +681,7 @@ void MatmulFloatSse64(const float *a, const float *b, float *c, const float *bia | |||
| _mm_store_ss(dst + 6, dst2); | |||
| if (r > 1) { | |||
| dst += stride; | |||
| _mm_store_ps(dst, dst3); | |||
| _mm_storeu_ps(dst, dst3); | |||
| _mm_store_ss(dst + 4, dst4); | |||
| dst4 = _mm_shuffle_ps(dst4, dst4, _MM_SHUFFLE(0, 3, 2, 1)); | |||
| _mm_store_ss(dst + 5, dst4); | |||
| @@ -690,7 +690,7 @@ void MatmulFloatSse64(const float *a, const float *b, float *c, const float *bia | |||
| } | |||
| if (r > 2) { | |||
| dst += stride; | |||
| _mm_store_ps(dst, dst5); | |||
| _mm_storeu_ps(dst, dst5); | |||
| _mm_store_ss(dst + 4, dst6); | |||
| dst6 = _mm_shuffle_ps(dst6, dst6, _MM_SHUFFLE(0, 3, 2, 1)); | |||
| _mm_store_ss(dst + 5, dst6); | |||
| @@ -699,7 +699,7 @@ void MatmulFloatSse64(const float *a, const float *b, float *c, const float *bia | |||
| } | |||
| if (r > 3) { | |||
| dst += stride; | |||
| _mm_store_ps(dst, dst7); | |||
| _mm_storeu_ps(dst, dst7); | |||
| _mm_store_ss(dst + 4, dst8); | |||
| dst8 = _mm_shuffle_ps(dst8, dst8, _MM_SHUFFLE(0, 3, 2, 1)); | |||
| _mm_store_ss(dst + 5, dst8); | |||
| @@ -708,22 +708,22 @@ void MatmulFloatSse64(const float *a, const float *b, float *c, const float *bia | |||
| dst += stride; | |||
| } | |||
| default: // write8 | |||
| _mm_store_ps(dst, dst1); | |||
| _mm_store_ps(dst + 4, dst2); | |||
| _mm_storeu_ps(dst, dst1); | |||
| _mm_storeu_ps(dst + 4, dst2); | |||
| if (r > 1) { | |||
| dst += stride; | |||
| _mm_store_ps(dst, dst3); | |||
| _mm_store_ps(dst + 4, dst4); | |||
| _mm_storeu_ps(dst, dst3); | |||
| _mm_storeu_ps(dst + 4, dst4); | |||
| } | |||
| if (r > 2) { | |||
| dst += stride; | |||
| _mm_store_ps(dst, dst5); | |||
| _mm_store_ps(dst + 4, dst6); | |||
| _mm_storeu_ps(dst, dst5); | |||
| _mm_storeu_ps(dst + 4, dst6); | |||
| } | |||
| if (r > 3) { | |||
| dst += stride; | |||
| _mm_store_ps(dst, dst7); | |||
| _mm_store_ps(dst + 4, dst8); | |||
| _mm_storeu_ps(dst, dst7); | |||
| _mm_storeu_ps(dst + 4, dst8); | |||
| dst += stride; | |||
| } | |||
| } | |||
| @@ -0,0 +1,173 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifdef ENABLE_X86_64_SSE | |||
| #include <nmmintrin.h> | |||
| #include "nnacl/fp32/common_func.h" | |||
| void PostFuncBiasReluC8(float *dst, const float *src, const float *bias, size_t oc8div, size_t oc8mod, | |||
| size_t plane_size, size_t stride, size_t relu_type) { | |||
| __m128 relu6 = _mm_set_ps1(6.0); | |||
| __m128 zero = _mm_setzero_ps(); | |||
| stride /= sizeof(float); | |||
| for (int loop_c8 = 0; !(loop_c8 == oc8div); loop_c8 += C8NUM) { | |||
| size_t plane_size_tmp = plane_size; | |||
| float *dst_c8 = dst + loop_c8; | |||
| __m128 bias1 = _mm_setzero_ps(); | |||
| __m128 bias2 = _mm_setzero_ps(); | |||
| if (bias != NULL) { | |||
| bias1 = _mm_loadu_ps(bias); | |||
| bias2 = _mm_loadu_ps(bias + 4); | |||
| bias += 8; | |||
| } | |||
| for (; plane_size_tmp >= C4NUM; plane_size_tmp -= C4NUM) { | |||
| __m128 src1 = _mm_loadu_ps(src); | |||
| __m128 src2 = _mm_loadu_ps(src + 4); | |||
| __m128 src3 = _mm_loadu_ps(src + 8); | |||
| __m128 src4 = _mm_loadu_ps(src + 12); | |||
| __m128 src5 = _mm_loadu_ps(src + 16); | |||
| __m128 src6 = _mm_loadu_ps(src + 20); | |||
| __m128 src7 = _mm_loadu_ps(src + 24); | |||
| __m128 src8 = _mm_loadu_ps(src + 28); | |||
| src += 32; | |||
| src1 = _mm_add_ps(src1, bias1); | |||
| src2 = _mm_add_ps(src2, bias2); | |||
| src3 = _mm_add_ps(src3, bias1); | |||
| src4 = _mm_add_ps(src4, bias2); | |||
| src5 = _mm_add_ps(src5, bias1); | |||
| src6 = _mm_add_ps(src6, bias2); | |||
| src7 = _mm_add_ps(src7, bias1); | |||
| src8 = _mm_add_ps(src8, bias2); | |||
| switch (relu_type) { | |||
| case 3: | |||
| src1 = _mm_min_ps(src1, relu6); | |||
| src2 = _mm_min_ps(src2, relu6); | |||
| src3 = _mm_min_ps(src3, relu6); | |||
| src4 = _mm_min_ps(src4, relu6); | |||
| src5 = _mm_min_ps(src5, relu6); | |||
| src6 = _mm_min_ps(src6, relu6); | |||
| src7 = _mm_min_ps(src7, relu6); | |||
| src8 = _mm_min_ps(src8, relu6); | |||
| case 1: | |||
| src1 = _mm_max_ps(src1, zero); | |||
| src2 = _mm_max_ps(src2, zero); | |||
| src3 = _mm_max_ps(src3, zero); | |||
| src4 = _mm_max_ps(src4, zero); | |||
| src5 = _mm_max_ps(src5, zero); | |||
| src6 = _mm_max_ps(src6, zero); | |||
| src7 = _mm_max_ps(src7, zero); | |||
| src8 = _mm_max_ps(src8, zero); | |||
| break; | |||
| } | |||
| _mm_storeu_ps(dst_c8, src1); | |||
| _mm_storeu_ps(dst_c8 + 4, src2); | |||
| dst_c8 += stride; | |||
| _mm_storeu_ps(dst_c8, src3); | |||
| _mm_storeu_ps(dst_c8 + 4, src4); | |||
| dst_c8 += stride; | |||
| _mm_storeu_ps(dst_c8, src5); | |||
| _mm_storeu_ps(dst_c8 + 4, src6); | |||
| dst_c8 += stride; | |||
| _mm_storeu_ps(dst_c8, src7); | |||
| _mm_storeu_ps(dst_c8 + 4, src8); | |||
| dst_c8 += stride; | |||
| } | |||
| for (; plane_size_tmp > 0; plane_size_tmp -= 1) { | |||
| __m128 src1 = _mm_loadu_ps(src); | |||
| __m128 src2 = _mm_loadu_ps(src + 4); | |||
| src1 = _mm_add_ps(src1, bias1); | |||
| src2 = _mm_add_ps(src2, bias2); | |||
| switch (relu_type) { | |||
| case 3: | |||
| src1 = _mm_min_ps(src1, relu6); | |||
| src2 = _mm_min_ps(src2, relu6); | |||
| case 1: | |||
| src1 = _mm_max_ps(src1, zero); | |||
| src2 = _mm_max_ps(src2, zero); | |||
| break; | |||
| } | |||
| _mm_storeu_ps(dst_c8, src1); | |||
| _mm_storeu_ps(dst_c8 + 4, src2); | |||
| dst_c8 += stride; | |||
| src += 8; | |||
| } | |||
| } | |||
| if (oc8mod == 0) { | |||
| return; | |||
| } | |||
| __m128 bias1 = _mm_setzero_ps(); | |||
| __m128 bias2 = _mm_setzero_ps(); | |||
| if (bias != NULL) { | |||
| bias1 = _mm_loadu_ps(bias); | |||
| bias2 = _mm_loadu_ps(bias + 4); | |||
| bias += 8; | |||
| } | |||
| float *dst_c1 = dst + oc8div; | |||
| for (size_t plane_size_tmp = plane_size; plane_size_tmp > 0; plane_size_tmp -= 1) { | |||
| __m128 src1 = _mm_loadu_ps(src); | |||
| __m128 src2 = _mm_loadu_ps(src + 4); | |||
| src += 8; | |||
| src1 = _mm_add_ps(src1, bias1); | |||
| src2 = _mm_add_ps(src2, bias2); | |||
| switch (relu_type) { | |||
| case 3: | |||
| src1 = _mm_min_ps(src1, relu6); | |||
| src2 = _mm_min_ps(src2, relu6); | |||
| case 1: | |||
| src1 = _mm_max_ps(src1, zero); | |||
| src2 = _mm_max_ps(src2, zero); | |||
| break; | |||
| } | |||
| switch (oc8mod) { | |||
| case 1: | |||
| _mm_store_ss(dst_c1, src1); | |||
| dst_c1 += stride; | |||
| break; | |||
| case 2: | |||
| _mm_storel_pi((__m64 *)(dst_c1), src1); | |||
| dst_c1 += stride; | |||
| break; | |||
| case 3: | |||
| _mm_storel_pi((__m64 *)(dst_c1), src1); | |||
| src1 = _mm_unpackhi_ps(src1, src1); | |||
| _mm_store_ss(dst_c1 + 2, src1); | |||
| dst_c1 += stride; | |||
| break; | |||
| case 4: | |||
| _mm_storeu_ps(dst_c1, src1); | |||
| dst_c1 += stride; | |||
| break; | |||
| case 5: | |||
| _mm_storeu_ps(dst_c1, src1); | |||
| _mm_store_ss(dst_c1 + 4, src2); | |||
| dst_c1 += stride; | |||
| break; | |||
| case 6: | |||
| _mm_storeu_ps(dst_c1, src1); | |||
| _mm_storel_pi((__m64 *)(dst_c1 + 4), src2); | |||
| dst_c1 += stride; | |||
| break; | |||
| case 7: | |||
| _mm_storeu_ps(dst_c1, src1); | |||
| _mm_storel_pi((__m64 *)(dst_c1 + 4), src2); | |||
| src2 = _mm_unpackhi_ps(src2, src2); | |||
| _mm_store_ss(dst_c1 + 6, src2); | |||
| dst_c1 += stride; | |||
| break; | |||
| } | |||
| } | |||
| } | |||
| #endif | |||
| @@ -0,0 +1,258 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifdef ENABLE_X86_64_SSE | |||
| #include <nmmintrin.h> | |||
| #include "nnacl/fp32/common_func.h" | |||
/**
 * SSE implementation of the left-hand Winograd transform.
 *
 * With tile = 4 * length floats, for every row y in [0, h), column x in [0, w)
 * and lane j in [0, tile) this computes
 *
 *   M[(y * w + x) * tile + j] = sum over i in [0, k) of
 *                               B[i * h + y] * S[(i * w + x) * tile + j]
 *
 * The k terms are consumed in groups of 7, then 4, then 3, then singly. Inside a
 * group the even-position products are added to the running output and the
 * odd-position products to a second accumulator that is folded in last.
 */
void WinogradTransLeft(const float *S, const float *B, float *M, size_t w, size_t h, size_t k, size_t length) {
  const size_t tile = length * 4;       /* floats in one output tile */
  const size_t slice = length * w * 4;  /* floats between consecutive k-slices of S */
  const int cols = w;
  const int vecs = length;
  float *dst = M;

  for (int row = 0; row < h; ++row) {
    for (int col = 0; col < cols; ++col) {
      const float *src = S + (size_t)col * tile;
      const float *coef = B + row;
      int k_left = k;

      memset(dst, 0, tile * sizeof(float));

      /* Groups of seven k-slices. */
      while (k_left >= 7) {
        const __m128 c0 = _mm_load_ps1(coef);
        const __m128 c1 = _mm_load_ps1(coef + h);
        const __m128 c2 = _mm_load_ps1(coef + 2 * h);
        const __m128 c3 = _mm_load_ps1(coef + 3 * h);
        const __m128 c4 = _mm_load_ps1(coef + 4 * h);
        const __m128 c5 = _mm_load_ps1(coef + 5 * h);
        const __m128 c6 = _mm_load_ps1(coef + 6 * h);
        for (int v = 0; v < vecs; ++v) {
          float *out = dst + 4 * (size_t)v;
          const float *in = src + 4 * (size_t)v;
          __m128 even = _mm_loadu_ps(out);
          even = _mm_add_ps(even, _mm_mul_ps(_mm_loadu_ps(in), c0));
          __m128 odd = _mm_mul_ps(_mm_loadu_ps(in + slice), c1);
          even = _mm_add_ps(even, _mm_mul_ps(_mm_loadu_ps(in + 2 * slice), c2));
          odd = _mm_add_ps(odd, _mm_mul_ps(_mm_loadu_ps(in + 3 * slice), c3));
          even = _mm_add_ps(even, _mm_mul_ps(_mm_loadu_ps(in + 4 * slice), c4));
          odd = _mm_add_ps(odd, _mm_mul_ps(_mm_loadu_ps(in + 5 * slice), c5));
          even = _mm_add_ps(even, _mm_mul_ps(_mm_loadu_ps(in + 6 * slice), c6));
          _mm_storeu_ps(out, _mm_add_ps(even, odd));
        }
        coef += 7 * h;
        src += 7 * slice;
        k_left -= 7;
      }

      /* Groups of four k-slices. */
      while (k_left >= 4) {
        const __m128 c0 = _mm_load_ps1(coef);
        const __m128 c1 = _mm_load_ps1(coef + h);
        const __m128 c2 = _mm_load_ps1(coef + 2 * h);
        const __m128 c3 = _mm_load_ps1(coef + 3 * h);
        for (int v = 0; v < vecs; ++v) {
          float *out = dst + 4 * (size_t)v;
          const float *in = src + 4 * (size_t)v;
          __m128 even = _mm_loadu_ps(out);
          even = _mm_add_ps(even, _mm_mul_ps(_mm_loadu_ps(in), c0));
          __m128 odd = _mm_mul_ps(_mm_loadu_ps(in + slice), c1);
          even = _mm_add_ps(even, _mm_mul_ps(_mm_loadu_ps(in + 2 * slice), c2));
          odd = _mm_add_ps(odd, _mm_mul_ps(_mm_loadu_ps(in + 3 * slice), c3));
          _mm_storeu_ps(out, _mm_add_ps(even, odd));
        }
        coef += 4 * h;
        src += 4 * slice;
        k_left -= 4;
      }

      /* Groups of three k-slices. */
      while (k_left >= 3) {
        const __m128 c0 = _mm_load_ps1(coef);
        const __m128 c1 = _mm_load_ps1(coef + h);
        const __m128 c2 = _mm_load_ps1(coef + 2 * h);
        for (int v = 0; v < vecs; ++v) {
          float *out = dst + 4 * (size_t)v;
          const float *in = src + 4 * (size_t)v;
          __m128 even = _mm_loadu_ps(out);
          even = _mm_add_ps(even, _mm_mul_ps(_mm_loadu_ps(in), c0));
          const __m128 odd = _mm_mul_ps(_mm_loadu_ps(in + slice), c1);
          even = _mm_add_ps(even, _mm_mul_ps(_mm_loadu_ps(in + 2 * slice), c2));
          _mm_storeu_ps(out, _mm_add_ps(even, odd));
        }
        coef += 3 * h;
        src += 3 * slice;
        k_left -= 3;
      }

      /* Remaining k-slices, one at a time. */
      while (k_left > 0) {
        const __m128 c0 = _mm_load_ps1(coef);
        for (int v = 0; v < vecs; ++v) {
          float *out = dst + 4 * (size_t)v;
          const float *in = src + 4 * (size_t)v;
          const __m128 acc = _mm_add_ps(_mm_loadu_ps(out), _mm_mul_ps(_mm_loadu_ps(in), c0));
          _mm_storeu_ps(out, acc);
        }
        coef += h;
        src += slice;
        k_left -= 1;
      }

      dst += tile;
    }
  }
}
/**
 * SSE implementation of the right-hand Winograd transform.
 *
 * With tile = 4 * length floats, for every row y in [0, h), column x in [0, w)
 * and lane j in [0, tile) this computes
 *
 *   M[(y * w + x) * tile + j] = sum over i in [0, k) of
 *                               B[i * h + x] * S[(y * k + i) * tile + j]
 *
 * The k terms are consumed in groups of 7, then 4, then 3, then singly. Inside a
 * group the even-position products are added to the running output and the
 * odd-position products to a second accumulator that is folded in last.
 */
void WinogradTransRight(const float *S, const float *B, float *M, size_t w, size_t h, size_t k, size_t length) {
  const size_t tile = length * 4;     /* floats in one tile of S or M */
  const size_t row_span = tile * k;   /* floats of S consumed per output row */
  const int vecs = length;
  const float *row_src = S;
  float *dst = M;

  for (int row = 0; row < h; ++row) {
    for (int col = 0; col < w; ++col) {
      const float *src = row_src;
      const float *coef = B + col;
      int k_left = k;

      memset(dst, 0, tile * sizeof(float));

      /* Groups of seven consecutive source tiles. */
      while (k_left >= 7) {
        const __m128 c0 = _mm_load_ps1(coef);
        const __m128 c1 = _mm_load_ps1(coef + h);
        const __m128 c2 = _mm_load_ps1(coef + 2 * h);
        const __m128 c3 = _mm_load_ps1(coef + 3 * h);
        const __m128 c4 = _mm_load_ps1(coef + 4 * h);
        const __m128 c5 = _mm_load_ps1(coef + 5 * h);
        const __m128 c6 = _mm_load_ps1(coef + 6 * h);
        for (int v = 0; v < vecs; ++v) {
          float *out = dst + 4 * (size_t)v;
          const float *in = src + 4 * (size_t)v;
          __m128 even = _mm_loadu_ps(out);
          even = _mm_add_ps(even, _mm_mul_ps(_mm_loadu_ps(in), c0));
          __m128 odd = _mm_mul_ps(_mm_loadu_ps(in + tile), c1);
          even = _mm_add_ps(even, _mm_mul_ps(_mm_loadu_ps(in + 2 * tile), c2));
          odd = _mm_add_ps(odd, _mm_mul_ps(_mm_loadu_ps(in + 3 * tile), c3));
          even = _mm_add_ps(even, _mm_mul_ps(_mm_loadu_ps(in + 4 * tile), c4));
          odd = _mm_add_ps(odd, _mm_mul_ps(_mm_loadu_ps(in + 5 * tile), c5));
          even = _mm_add_ps(even, _mm_mul_ps(_mm_loadu_ps(in + 6 * tile), c6));
          _mm_storeu_ps(out, _mm_add_ps(even, odd));
        }
        coef += 7 * h;
        src += 7 * tile;
        k_left -= 7;
      }

      /* Groups of four consecutive source tiles. */
      while (k_left >= 4) {
        const __m128 c0 = _mm_load_ps1(coef);
        const __m128 c1 = _mm_load_ps1(coef + h);
        const __m128 c2 = _mm_load_ps1(coef + 2 * h);
        const __m128 c3 = _mm_load_ps1(coef + 3 * h);
        for (int v = 0; v < vecs; ++v) {
          float *out = dst + 4 * (size_t)v;
          const float *in = src + 4 * (size_t)v;
          __m128 even = _mm_loadu_ps(out);
          even = _mm_add_ps(even, _mm_mul_ps(_mm_loadu_ps(in), c0));
          __m128 odd = _mm_mul_ps(_mm_loadu_ps(in + tile), c1);
          even = _mm_add_ps(even, _mm_mul_ps(_mm_loadu_ps(in + 2 * tile), c2));
          odd = _mm_add_ps(odd, _mm_mul_ps(_mm_loadu_ps(in + 3 * tile), c3));
          _mm_storeu_ps(out, _mm_add_ps(even, odd));
        }
        coef += 4 * h;
        src += 4 * tile;
        k_left -= 4;
      }

      /* Groups of three consecutive source tiles. */
      while (k_left >= 3) {
        const __m128 c0 = _mm_load_ps1(coef);
        const __m128 c1 = _mm_load_ps1(coef + h);
        const __m128 c2 = _mm_load_ps1(coef + 2 * h);
        for (int v = 0; v < vecs; ++v) {
          float *out = dst + 4 * (size_t)v;
          const float *in = src + 4 * (size_t)v;
          __m128 even = _mm_loadu_ps(out);
          even = _mm_add_ps(even, _mm_mul_ps(_mm_loadu_ps(in), c0));
          const __m128 odd = _mm_mul_ps(_mm_loadu_ps(in + tile), c1);
          even = _mm_add_ps(even, _mm_mul_ps(_mm_loadu_ps(in + 2 * tile), c2));
          _mm_storeu_ps(out, _mm_add_ps(even, odd));
        }
        coef += 3 * h;
        src += 3 * tile;
        k_left -= 3;
      }

      /* Remaining source tiles, one at a time. */
      while (k_left >= 1) {
        const __m128 c0 = _mm_load_ps1(coef);
        for (int v = 0; v < vecs; ++v) {
          float *out = dst + 4 * (size_t)v;
          const float *in = src + 4 * (size_t)v;
          const __m128 acc = _mm_add_ps(_mm_loadu_ps(out), _mm_mul_ps(_mm_loadu_ps(in), c0));
          _mm_storeu_ps(out, acc);
        }
        coef += h;
        src += tile;
        k_left -= 1;
      }

      dst += tile;
    }
    row_src += row_span;
  }
}
| #endif | |||
| @@ -5,6 +5,7 @@ function Run_Converter() { | |||
| # Unzip x86 runtime and convertor | |||
| cd ${x86_path} || exit 1 | |||
| tar -zxf mindspore-lite-${version}-runtime-x86-${process_unit_x86}.tar.gz || exit 1 | |||
| tar -zxf mindspore-lite-${version}-runtime-x86-sse-${process_unit_x86}.tar.gz || exit 1 | |||
| tar -zxf mindspore-lite-${version}-converter-ubuntu.tar.gz || exit 1 | |||
| cd ${x86_path}/mindspore-lite-${version}-converter-ubuntu || exit 1 | |||
| @@ -480,6 +481,234 @@ function Run_x86() { | |||
| done < ${models_only_for_process_config} | |||
| } | |||
| # Run on x86 sse platform: | |||
| # Helper: write the name in $1 and the cd command to ${run_x86_sse_log_file}, then enter the | |||
| # x86 sse runtime package directory built from ${x86_path}, ${version} and ${process_unit_x86}. | |||
| # Returns 1 when the directory cannot be entered. | |||
| function Run_x86_sse_enter_package() { | |||
|     local sse_package_path=${x86_path}/mindspore-lite-${version}-runtime-x86-sse-${process_unit_x86} | |||
|     echo "$1" >> "${run_x86_sse_log_file}" | |||
|     echo "cd ${sse_package_path}" >> "${run_x86_sse_log_file}" | |||
|     cd "${sse_package_path}" || return 1 | |||
| } | |||
| # Helper: run ./benchmark/benchmark once from the current directory. | |||
| # $1 is the label used in the result line; all remaining arguments are passed to the benchmark. | |||
| # The command line is appended to ${run_x86_sse_log_file} before it runs, the benchmark output is | |||
| # appended to the same file, and "x86_sse: <label> pass" or "x86_sse: <label> failed" is appended | |||
| # to ${run_benchmark_result_file}. Returns 0 on success and 1 when the benchmark fails. | |||
| function Run_x86_sse_benchmark() { | |||
|     local result_label=$1 | |||
|     shift | |||
|     # The logged command is built from the same arguments that are executed, so the two cannot drift apart. | |||
|     echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:./lib:./third_party/libjpeg-turbo/lib:./third_party/opencv/lib;./benchmark/benchmark '"$*" >> "${run_x86_sse_log_file}" | |||
|     export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:./lib:./third_party/libjpeg-turbo/lib:./third_party/opencv/lib | |||
|     if ./benchmark/benchmark "$@" >> "${run_x86_sse_log_file}"; then | |||
|         run_result="x86_sse: ${result_label} pass"; echo ${run_result} >> ${run_benchmark_result_file} | |||
|     else | |||
|         run_result="x86_sse: ${result_label} failed"; echo ${run_result} >> ${run_benchmark_result_file} | |||
|         return 1 | |||
|     fi | |||
| } | |||
| # Benchmark every converted model listed in the models_*_config files with the x86 sse runtime package. | |||
| # Reads the globals x86_path, version, process_unit_x86, ms_models_path, run_x86_sse_log_file, | |||
| # run_benchmark_result_file and the models_*_config paths. Lines starting with '#' in a config file | |||
| # are skipped (except in the process-only list, which is read as is). | |||
| # Returns 1 as soon as the package directory cannot be entered or one benchmark fails. | |||
| function Run_x86_sse() { | |||
|     local hiai_input_path=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/input | |||
|     local hiai_output_path=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/output | |||
|     local quant_data_path=/home/workspace/mindspore_dataset/mslite/quantTraining | |||
|     # Run tflite converted models: | |||
|     while read line; do | |||
|         model_name=${line} | |||
|         if [[ $model_name == \#* ]]; then | |||
|             continue | |||
|         fi | |||
|         Run_x86_sse_enter_package "${model_name}" || return 1 | |||
|         Run_x86_sse_benchmark "${model_name}" "--modelFile=${ms_models_path}/${model_name}.ms" "--inDataFile=${hiai_input_path}/${model_name}.ms.bin" "--benchmarkDataFile=${hiai_output_path}/${model_name}.ms.out" || return 1 | |||
|     done < ${models_tflite_config} | |||
|     # Run caffe converted models: | |||
|     while read line; do | |||
|         model_name=${line} | |||
|         if [[ $model_name == \#* ]]; then | |||
|             continue | |||
|         fi | |||
|         Run_x86_sse_enter_package "${model_name}" || return 1 | |||
|         Run_x86_sse_benchmark "${model_name}" "--modelFile=${ms_models_path}/${model_name}.ms" "--inDataFile=${hiai_input_path}/${model_name}.ms.bin" "--benchmarkDataFile=${hiai_output_path}/${model_name}.ms.out" || return 1 | |||
|     done < ${models_caffe_config} | |||
|     # Run onnx converted models (each config line is "<model_name>;<input_shapes>"): | |||
|     while read line; do | |||
|         model_name=${line%;*} | |||
|         length=${#model_name} | |||
|         input_shapes=${line:length+1} | |||
|         if [[ $model_name == \#* ]]; then | |||
|             continue | |||
|         fi | |||
|         Run_x86_sse_enter_package "${model_name}" || return 1 | |||
|         Run_x86_sse_benchmark "${model_name}" "--modelFile=${ms_models_path}/${model_name}.ms" "--inDataFile=${hiai_input_path}/${model_name}.ms.bin" "--inputShapes=${input_shapes}" "--benchmarkDataFile=${hiai_output_path}/${model_name}.ms.out" || return 1 | |||
|     done < ${models_onnx_config} | |||
|     # Run tflite post training quantization converted models: | |||
|     while read line; do | |||
|         model_name=${line} | |||
|         if [[ $model_name == \#* ]]; then | |||
|             continue | |||
|         fi | |||
|         Run_x86_sse_enter_package "${model_name}" || return 1 | |||
|         Run_x86_sse_benchmark "${model_name}" "--modelFile=${ms_models_path}/${model_name}_posttraining.ms" "--inDataFile=${quant_data_path}/mnist_calibration_data/00099.bin" "--benchmarkDataFile=${hiai_output_path}/${model_name}_posttraining.ms.out" || return 1 | |||
|     done < ${models_tflite_posttraining_config} | |||
|     # Run caffe post training quantization converted models: | |||
|     while read line; do | |||
|         model_name=${line} | |||
|         if [[ $model_name == \#* ]]; then | |||
|             continue | |||
|         fi | |||
|         Run_x86_sse_enter_package "${model_name}" || return 1 | |||
|         Run_x86_sse_benchmark "${model_name}" "--modelFile=${ms_models_path}/${model_name}_posttraining.ms" "--inDataFile=${quant_data_path}/ml_face_mnet_calibration_data/20_Family_Group_Family_Group_20_1001.bin" "--benchmarkDataFile=${hiai_output_path}/${model_name}_posttraining.ms.out" "--accuracyThreshold=105" || return 1 | |||
|     done < ${models_caffe_posttraining_config} | |||
|     # Run tflite aware training quantization converted models: | |||
|     while read line; do | |||
|         model_name=${line} | |||
|         if [[ $model_name == \#* ]]; then | |||
|             continue | |||
|         fi | |||
|         Run_x86_sse_enter_package "${model_name}" || return 1 | |||
|         Run_x86_sse_benchmark "${model_name}" "--modelFile=${ms_models_path}/${model_name}.ms" "--inDataFile=${hiai_input_path}/${model_name}.ms.bin" "--benchmarkDataFile=${hiai_output_path}/${model_name}.ms.out" || return 1 | |||
|     done < ${models_tflite_awaretraining_config} | |||
|     # Run mindspore converted train models: | |||
|     while read line; do | |||
|         model_name=${line} | |||
|         if [[ $model_name == \#* ]]; then | |||
|             continue | |||
|         fi | |||
|         Run_x86_sse_enter_package "${model_name}_train" || return 1 | |||
|         Run_x86_sse_benchmark "${model_name}_train" "--modelFile=${ms_models_path}/${model_name}_train.ms" "--inDataFile=${hiai_input_path}/${model_name}.ms.bin" "--benchmarkDataFile=${hiai_output_path}/${model_name}.train.ms.out" "--accuracyThreshold=1.5" || return 1 | |||
|     done < ${models_mindspore_train_config} | |||
|     # Run mindspore converted models: | |||
|     while read line; do | |||
|         model_name=${line} | |||
|         if [[ $model_name == \#* ]]; then | |||
|             continue | |||
|         fi | |||
|         Run_x86_sse_enter_package "${model_name}" || return 1 | |||
|         Run_x86_sse_benchmark "${model_name}" "--modelFile=${ms_models_path}/${model_name}.ms" "--inDataFile=${hiai_input_path}/${model_name}.ms.bin" "--benchmarkDataFile=${hiai_output_path}/${model_name}.ms.out" "--accuracyThreshold=1.5" || return 1 | |||
|     done < ${models_mindspore_config} | |||
|     # Run tflite weight quantization converted models: | |||
|     while read line; do | |||
|         model_name=${line} | |||
|         if [[ $model_name == \#* ]]; then | |||
|             continue | |||
|         fi | |||
|         Run_x86_sse_enter_package "${model_name}" || return 1 | |||
|         Run_x86_sse_benchmark "${model_name}" "--modelFile=${ms_models_path}/${model_name}_weightquant.ms" "--inDataFile=${hiai_input_path}/${model_name}.ms.bin" "--benchmarkDataFile=${hiai_output_path}/${model_name}.ms.out" || return 1 | |||
|     done < ${models_tflite_weightquant_config} | |||
|     # Run mindir weight quantization converted models: | |||
|     while read line; do | |||
|         model_name=${line} | |||
|         if [[ $model_name == \#* ]]; then | |||
|             continue | |||
|         fi | |||
|         Run_x86_sse_enter_package "${model_name}" || return 1 | |||
|         Run_x86_sse_benchmark "${model_name}" "--modelFile=${ms_models_path}/${model_name}_weightquant.ms" "--inDataFile=${hiai_input_path}/${model_name}.ms.bin" "--benchmarkDataFile=${hiai_output_path}/${model_name}.weightquant.ms.out" || return 1 | |||
|     done < ${models_mindspore_weightquant_config} | |||
|     # Run mindir mixbit weight quantization converted models (7bit first, then 9bit): | |||
|     while read line; do | |||
|         model_name=${line} | |||
|         if [[ $model_name == \#* ]]; then | |||
|             continue | |||
|         fi | |||
|         Run_x86_sse_enter_package "${model_name}" || return 1 | |||
|         Run_x86_sse_benchmark "${model_name}_7bit" "--modelFile=${ms_models_path}/${model_name}_7bit.ms" "--inDataFile=${hiai_input_path}/${model_name}.ms.bin" "--benchmarkDataFile=${hiai_output_path}/${model_name}_7bit.ms.out" || return 1 | |||
|         Run_x86_sse_benchmark "${model_name}_9bit" "--modelFile=${ms_models_path}/${model_name}_9bit.ms" "--inDataFile=${hiai_input_path}/${model_name}.ms.bin" "--benchmarkDataFile=${hiai_output_path}/${model_name}_9bit.ms.out" || return 1 | |||
|     done < ${models_mindspore_mixbit_config} | |||
|     # Run converted models which do not need to be cared about the accuracy: | |||
|     while read line; do | |||
|         model_name=${line} | |||
|         # A ".caffemodel" entry is benchmarked under its name without that extension. | |||
|         if [[ ${line##*.} == "caffemodel" ]]; then | |||
|             model_name=${line%.*} | |||
|         fi | |||
|         Run_x86_sse_enter_package "${model_name}" || return 1 | |||
|         Run_x86_sse_benchmark "${model_name}" "--modelFile=${ms_models_path}/${model_name}.ms" "--loopCount=1" "--warmUpLoopCount=0" || return 1 | |||
|     done < ${models_only_for_process_config} | |||
| } | |||
| # Run on arm64 platform: | |||
| function Run_arm64() { | |||
| # Unzip arm64 | |||
| @@ -979,6 +1208,12 @@ IFS="-" read -r -a file_name_array <<< "$file_name" | |||
| IFS="." read -r -a suffix <<< "${file_name_array[-1]}" | |||
| process_unit_x86=${suffix[0]} | |||
| x86_path=${release_path}/ubuntu_x86 | |||
| # Locate the x86 sse runtime package in ${x86_path}, split its path on "-" and take the text before | |||
| # the first "." of the last field as process_unit_x86. | |||
| # NOTE(review): this reassigns process_unit_x86, which was already set a few lines above from another | |||
| # package name - confirm both package names always end in the same suffix. | |||
| file_name=$(ls ${x86_path}/*runtime-x86-sse*.tar.gz) | |||
| IFS="-" read -r -a file_name_array <<< "$file_name" | |||
| IFS="." read -r -a suffix <<< "${file_name_array[-1]}" | |||
| process_unit_x86=${suffix[0]} | |||
| # Set models config filepath | |||
| models_tflite_config=${basepath}/models_tflite.cfg | |||
| models_caffe_config=${basepath}/models_caffe.cfg | |||
| @@ -1036,6 +1271,7 @@ else | |||
| exit 1 | |||
| fi | |||
| # Write benchmark result to temp file | |||
| run_benchmark_result_file=${basepath}/run_benchmark_result.txt | |||
| echo ' ' > ${run_benchmark_result_file} | |||
| @@ -1067,6 +1303,12 @@ Run_x86 & | |||
| Run_x86_PID=$! | |||
| sleep 1 | |||
| # Run on x86-sse: start Run_x86_sse as a background job and keep its PID in Run_x86_sse_PID | |||
| echo "start Run x86 sse ..." | |||
| Run_x86_sse & | |||
| Run_x86_sse_PID=$! | |||
| sleep 1 | |||
| # Run on arm64 | |||
| echo "start Run arm64 ..." | |||
| Run_arm64 | |||
| @@ -1099,6 +1341,16 @@ if [[ ${Run_x86_status} != 0 ]];then | |||
| exit 1 | |||
| fi | |||
| # Wait for the job whose PID is in Run_x86_sse_PID; on a non-zero status print the x86 sse log, | |||
| # call Print_Benchmark_Result and exit with status 1. | |||
| wait ${Run_x86_sse_PID} | |||
| Run_x86_sse_status=$? | |||
| if [[ ${Run_x86_sse_status} != 0 ]];then | |||
| echo "Run_x86 sse failed" | |||
| cat ${run_x86_sse_log_file} | |||
| Print_Benchmark_Result | |||
| exit 1 | |||
| fi | |||
| if [[ ${Run_arm64_status} != 0 ]];then | |||
| echo "Run_arm64 failed" | |||
| cat ${run_arm64_log_file} | |||