diff --git a/mindspore/lite/nnacl/fp32/common_func_fp32.c b/mindspore/lite/nnacl/fp32/common_func_fp32.c
index 2ec26b5e98..01914b2a5c 100644
--- a/mindspore/lite/nnacl/fp32/common_func_fp32.c
+++ b/mindspore/lite/nnacl/fp32/common_func_fp32.c
@@ -56,7 +56,7 @@ void PostConvFuncFp32C8(const float *c8_out_ptr, float *out_ptr, const float *bi
 
 void PostConvFuncFp32C4(const float *c4_out_ptr, float *out_ptr, const float *bias_ptr, size_t output_channel,
                         size_t plane_size, size_t plane_stride, size_t relu_type) {
-#ifdef ENABLE_ARM
+#if defined(ENABLE_ARM) || defined(ENABLE_SSE)
   size_t oc4mod = output_channel % C4NUM;
   size_t oc4div = output_channel - oc4mod;
   size_t stride_size = (plane_stride - plane_size) * C4NUM * sizeof(float);
diff --git a/mindspore/lite/nnacl/fp32/common_func_fp32.h b/mindspore/lite/nnacl/fp32/common_func_fp32.h
index a6b7c09cb7..898af91d64 100644
--- a/mindspore/lite/nnacl/fp32/common_func_fp32.h
+++ b/mindspore/lite/nnacl/fp32/common_func_fp32.h
@@ -50,10 +50,6 @@ void DeconvDwFp32Center(float *dst, const float *src, const float *weight, size_
                         size_t in_kh_step, size_t in_kw_step);
 void PostFuncBiasReluC8(float *dst, const float *src, const float *bias, size_t oc8div, size_t oc8mod,
                         size_t plane_size, size_t stride, size_t relu_type);
-#endif
-
-#ifdef ENABLE_ARM
-
 void ConvDwFp32Row(float *output_ptr, const float *input_ptr, const float *weight_ptr, size_t num_pixels,
                    size_t output_channel, size_t input_step);
 void PostFuncBiasReluC4(float *dst, const float *src, const float *bias, size_t oc4div, size_t oc4mod,
diff --git a/mindspore/lite/nnacl/fp32/conv_depthwise_fp32.c b/mindspore/lite/nnacl/fp32/conv_depthwise_fp32.c
index 372cefa8e7..83467834bc 100644
--- a/mindspore/lite/nnacl/fp32/conv_depthwise_fp32.c
+++ b/mindspore/lite/nnacl/fp32/conv_depthwise_fp32.c
@@ -21,7 +21,7 @@
 #include <arm_neon.h>
 #endif
 
-#ifndef ENABLE_ARM
+#if !defined(ENABLE_ARM) && !defined(ENABLE_SSE)
 void ConvDwFp32Row(float *output_ptr, const float *input_ptr, const float *weight_ptr, int num_pixels,
                    int output_channel, int input_step) {
   for (int i = 0; i < num_pixels; i++) {
diff --git a/mindspore/lite/nnacl/fp32/deconv_winograd_fp32.c b/mindspore/lite/nnacl/fp32/deconv_winograd_fp32.c
index 1594d09c09..bc305962f9 100644
--- a/mindspore/lite/nnacl/fp32/deconv_winograd_fp32.c
+++ b/mindspore/lite/nnacl/fp32/deconv_winograd_fp32.c
@@ -161,7 +161,7 @@ void DeConvWgInputPack(const float *src_ptr, float *dst_ptr, int channel, int st
   return;
 }
 
-#ifndef ENABLE_ARM
+#if !defined(ENABLE_ARM) && !defined(ENABLE_SSE)
 void TiledC4MatmulFp32(float *dst, const float *src, const float *weight, size_t cal_num, size_t ic4, size_t oc4) {
   int dx, sz, dz;
   const int src_depth_step = 4 * DECONV_WINOGRAD_DEFAULT_TILE;
diff --git a/mindspore/lite/nnacl/x86_64_sse/ConvDwFp32Row_sse.c b/mindspore/lite/nnacl/x86_64_sse/ConvDwFp32Row_sse.c
new file mode 100644
index 0000000000..ccf1a72395
--- /dev/null
+++ b/mindspore/lite/nnacl/x86_64_sse/ConvDwFp32Row_sse.c
@@ -0,0 +1,97 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef ENABLE_SSE
+#include <x86intrin.h>
+#include "nnacl/fp32/common_func_fp32.h"
+
+// Accumulates one depthwise-convolution kernel tap into a row of output pixels (SSE path).
+// For every pixel p in [0, num_pixels) and channel c in [0, output_channel):
+//   output_ptr[p * output_channel + c] += weight_ptr[c] * input_ptr[p * input_step + c]
+// output_ptr is read and updated in place, so it must already hold the running sums.
+// input_step is the distance between consecutive input pixels, counted in floats.
+// NOTE(review): MS_MLAQ_F32(d, a, b) is assumed to compute d + a * b, matching the scalar tail - confirm.
+void ConvDwFp32Row(float *output_ptr, const float *input_ptr, const float *weight_ptr, size_t num_pixels,
+                   size_t output_channel, size_t input_step) {
+  size_t out_c16 = DOWN_DIV(output_channel, C16NUM) * C16NUM;
+  size_t out_c8 = DOWN_DIV(output_channel, C8NUM) * C8NUM;
+  size_t out_c4 = DOWN_DIV(output_channel, C4NUM) * C4NUM;
+  for (size_t i = 0; i < num_pixels; i++) {
+    // The same weights are used for every pixel, so restart from the first channel each time.
+    const float *weight_tmp = weight_ptr;
+    const float *input_tmp = input_ptr;
+    size_t out_c = 0;
+    // Blocks of 16 channels (four SSE registers).
+    for (; out_c < out_c16; out_c += C16NUM) {
+      __m128 dst1 = _mm_loadu_ps(output_ptr);
+      __m128 dst2 = _mm_loadu_ps(output_ptr + 4);
+      __m128 dst3 = _mm_loadu_ps(output_ptr + 8);
+      __m128 dst4 = _mm_loadu_ps(output_ptr + 12);
+      __m128 w1 = _mm_loadu_ps(weight_tmp);
+      __m128 w2 = _mm_loadu_ps(weight_tmp + 4);
+      __m128 w3 = _mm_loadu_ps(weight_tmp + 8);
+      __m128 w4 = _mm_loadu_ps(weight_tmp + 12);
+      __m128 in1 = _mm_loadu_ps(input_tmp);
+      __m128 in2 = _mm_loadu_ps(input_tmp + 4);
+      __m128 in3 = _mm_loadu_ps(input_tmp + 8);
+      __m128 in4 = _mm_loadu_ps(input_tmp + 12);
+      dst1 = MS_MLAQ_F32(dst1, w1, in1);
+      dst2 = MS_MLAQ_F32(dst2, w2, in2);
+      dst3 = MS_MLAQ_F32(dst3, w3, in3);
+      dst4 = MS_MLAQ_F32(dst4, w4, in4);
+      _mm_storeu_ps(output_ptr, dst1);
+      _mm_storeu_ps(output_ptr + 4, dst2);
+      _mm_storeu_ps(output_ptr + 8, dst3);
+      _mm_storeu_ps(output_ptr + 12, dst4);
+      output_ptr += 16;
+      input_tmp += 16;
+      weight_tmp += 16;
+    }
+    // Blocks of 8 channels.
+    for (; out_c < out_c8; out_c += C8NUM) {
+      __m128 dst1 = _mm_loadu_ps(output_ptr);
+      __m128 dst2 = _mm_loadu_ps(output_ptr + 4);
+      __m128 w1 = _mm_loadu_ps(weight_tmp);
+      __m128 w2 = _mm_loadu_ps(weight_tmp + 4);
+      __m128 in1 = _mm_loadu_ps(input_tmp);
+      __m128 in2 = _mm_loadu_ps(input_tmp + 4);
+      dst1 = MS_MLAQ_F32(dst1, w1, in1);
+      dst2 = MS_MLAQ_F32(dst2, w2, in2);
+      _mm_storeu_ps(output_ptr, dst1);
+      _mm_storeu_ps(output_ptr + 4, dst2);
+      output_ptr += 8;
+      input_tmp += 8;
+      weight_tmp += 8;
+    }
+    // Blocks of 4 channels.
+    for (; out_c < out_c4; out_c += C4NUM) {
+      __m128 dst1 = _mm_loadu_ps(output_ptr);
+      __m128 w1 = _mm_loadu_ps(weight_tmp);
+      __m128 in1 = _mm_loadu_ps(input_tmp);
+      dst1 = MS_MLAQ_F32(dst1, w1, in1);
+      _mm_storeu_ps(output_ptr, dst1);
+      output_ptr += 4;
+      input_tmp += 4;
+      weight_tmp += 4;
+    }
+    // Scalar tail: out_c is relative to the per-pixel base pointers, so index those directly.
+    for (; out_c < output_channel; out_c++) {
+      *output_ptr++ += weight_ptr[out_c] * input_ptr[out_c];
+    }
+    input_ptr += input_step;
+  }
+}
+#endif
diff --git a/mindspore/lite/nnacl/x86_64_sse/PostFuncBiasReluC4.c b/mindspore/lite/nnacl/x86_64_sse/PostFuncBiasReluC4.c
new file mode 100644
index 0000000000..382a2d2cb4
--- /dev/null
+++ b/mindspore/lite/nnacl/x86_64_sse/PostFuncBiasReluC4.c
@@ -0,0 +1,124 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef ENABLE_SSE
+#include <x86intrin.h>
+#include "nnacl/fp32/common_func_fp32.h"
+
+// Applies the activation selected by relu_type to four packed floats:
+// 3 clamps each lane to [0, 6], 1 clamps each lane to [0, +inf), any other value leaves v unchanged.
+static inline __m128 PostFuncActC4(__m128 v, size_t relu_type, __m128 zero, __m128 relu6) {
+  switch (relu_type) {
+    case 3:
+      v = _mm_min_ps(v, relu6);
+      // fall through - relu6 still needs the lower clamp applied below
+    case 1:
+      v = _mm_max_ps(v, zero);
+      break;
+    default:
+      break;
+  }
+  return v;
+}
+
+// Adds bias, applies the optional activation and scatters C4-packed results into dst.
+//   src   : consumed sequentially; four floats (one channel block) per pixel, plane_size pixels per block,
+//           then plane_stride (after conversion to floats) is skipped before the next full channel block.
+//   dst   : consecutive pixels are oc4div + oc4mod floats apart; the block starting at channel c is written
+//           at offset c within each pixel.
+//   bias  : may be NULL (treated as zero); otherwise four values are loaded per channel block.
+//   oc4div: channels covered by full blocks of four; oc4mod: channels in the final partial block.
+//   plane_stride is divided by sizeof(float) before use, so callers presumably pass a byte count.
+// NOTE(review): the partial block loads four bias floats - assumes bias is padded to a multiple of 4; confirm.
+void PostFuncBiasReluC4(float *dst, const float *src, const float *bias, size_t oc4div, size_t oc4mod,
+                        size_t plane_size, size_t plane_stride, size_t relu_type) {
+  const __m128 relu6 = _mm_set_ps1(6.0f);
+  const __m128 zero = _mm_setzero_ps();
+  const size_t stride = oc4div + oc4mod;
+  plane_stride /= sizeof(float);
+  for (size_t loop_c4 = 0; loop_c4 < oc4div; loop_c4 += C4NUM) {
+    size_t plane_size_tmp = plane_size;
+    float *dst_c4 = dst + loop_c4;
+    __m128 bias1 = zero;
+    if (bias != NULL) {
+      bias1 = _mm_loadu_ps(bias);
+      bias += 4;
+    }
+    // Four pixels per iteration.
+    for (; plane_size_tmp >= C4NUM; plane_size_tmp -= C4NUM) {
+      __m128 src1 = _mm_add_ps(_mm_loadu_ps(src), bias1);
+      __m128 src2 = _mm_add_ps(_mm_loadu_ps(src + 4), bias1);
+      __m128 src3 = _mm_add_ps(_mm_loadu_ps(src + 8), bias1);
+      __m128 src4 = _mm_add_ps(_mm_loadu_ps(src + 12), bias1);
+      src += 16;
+      src1 = PostFuncActC4(src1, relu_type, zero, relu6);
+      src2 = PostFuncActC4(src2, relu_type, zero, relu6);
+      src3 = PostFuncActC4(src3, relu_type, zero, relu6);
+      src4 = PostFuncActC4(src4, relu_type, zero, relu6);
+      _mm_storeu_ps(dst_c4, src1);
+      dst_c4 += stride;
+      _mm_storeu_ps(dst_c4, src2);
+      dst_c4 += stride;
+      _mm_storeu_ps(dst_c4, src3);
+      dst_c4 += stride;
+      _mm_storeu_ps(dst_c4, src4);
+      dst_c4 += stride;
+    }
+    // Remaining pixels, one at a time.
+    for (; plane_size_tmp > 0; plane_size_tmp -= 1) {
+      __m128 src1 = _mm_add_ps(_mm_loadu_ps(src), bias1);
+      src1 = PostFuncActC4(src1, relu_type, zero, relu6);
+      _mm_storeu_ps(dst_c4, src1);
+      dst_c4 += stride;
+      src += 4;
+    }
+    src += plane_stride;
+  }
+
+  if (oc4mod == 0) {
+    return;
+  }
+  // Final partial channel block: only the first oc4mod lanes of each result are stored.
+  __m128 bias1 = zero;
+  if (bias != NULL) {
+    bias1 = _mm_loadu_ps(bias);
+  }
+  float *dst_c1 = dst + oc4div;
+  for (size_t plane_size_tmp = plane_size; plane_size_tmp > 0; plane_size_tmp -= 1) {
+    __m128 src1 = _mm_add_ps(_mm_loadu_ps(src), bias1);
+    src += 4;
+    src1 = PostFuncActC4(src1, relu_type, zero, relu6);
+    switch (oc4mod) {
+      case 1:
+        _mm_store_ss(dst_c1, src1);
+        break;
+      case 2:
+        _mm_storel_pi((__m64 *)(dst_c1), src1);
+        break;
+      case 3:
+        _mm_storel_pi((__m64 *)(dst_c1), src1);
+        // Move lane 2 into lane 0 so it can be written with a scalar store.
+        _mm_store_ss(dst_c1 + 2, _mm_unpackhi_ps(src1, src1));
+        break;
+      case 4:
+        _mm_storeu_ps(dst_c1, src1);
+        break;
+      default:
+        break;
+    }
+    dst_c1 += stride;
+  }
+}
+#endif
diff --git a/mindspore/lite/nnacl/x86_64_sse/PosFuncBiasRelu.c b/mindspore/lite/nnacl/x86_64_sse/PostFuncBiasReluC8.c
similarity index 100%
rename from mindspore/lite/nnacl/x86_64_sse/PosFuncBiasRelu.c
rename to mindspore/lite/nnacl/x86_64_sse/PostFuncBiasReluC8.c
diff --git a/mindspore/lite/nnacl/x86_64_sse/TiledC4MatMulFp32.c b/mindspore/lite/nnacl/x86_64_sse/TiledC4MatMulFp32.c
new file mode 100644
index 0000000000..2db1768ce9
--- /dev/null
+++ b/mindspore/lite/nnacl/x86_64_sse/TiledC4MatMulFp32.c
@@ -0,0 +1,186 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifdef ENABLE_SSE
+#include <x86intrin.h>
+#include "nnacl/fp32/common_func_fp32.h"
+// Multiplies eight 4-float source tiles by a chain of 4x4 weight blocks (SSE path).
+// For each of the oc4 output blocks, and for each tile t in [0, 8):
+//   dst[4 * t .. 4 * t + 3] = sum over k in [0, ic4), j in [0, 4) of weight_row(k, j) * src[32 * k + 4 * t + j]
+// where weight_row(k, j) is the four floats at weight + 16 * k + 4 * j of the current output block.
+// src is re-read from its start for every output block, weight is consumed sequentially (16 * ic4 floats per
+// output block) and dst advances by cal_num floats per output block.
+// Loads for the next step are issued before the current step's arithmetic is finished; that order is kept as is.
+// NOTE(review): ic4 must be >= 1 - with ic4 == 0 the size_t expression ic4 - 1 wraps around; confirm callers.
+// NOTE(review): indexing an __m128 with [] relies on the GCC/Clang vector extension.
+void TiledC4MatmulFp32(float *dst, const float *src, const float *weight, size_t cal_num, size_t ic4, size_t oc4) {
+  const float *src_tmp = src;
+  for (size_t i = 0; i < oc4; ++i) {
+    src = src_tmp;
+    size_t ic4_tmp = ic4 - 1;
+    // First input block: initialise the eight accumulators (tiles 0-3, then tiles 4-7).
+    __m128 src1 = _mm_loadu_ps(src);
+    __m128 src2 = _mm_loadu_ps(src + 4);
+    __m128 src3 = _mm_loadu_ps(src + 8);
+    __m128 src4 = _mm_loadu_ps(src + 12);
+    src += 16;
+    __m128 weight_data[4];
+    weight_data[0] = _mm_loadu_ps(weight);
+    weight_data[1] = _mm_loadu_ps(weight + 4);
+    weight_data[2] = _mm_loadu_ps(weight + 8);
+    weight_data[3] = _mm_loadu_ps(weight + 12);
+    weight += 16;
+    __m128 dst1 = _mm_mul_ps(weight_data[0], _mm_set_ps1(src1[0]));
+    __m128 dst2 = _mm_mul_ps(weight_data[0], _mm_set_ps1(src2[0]));
+    __m128 dst3 = _mm_mul_ps(weight_data[0], _mm_set_ps1(src3[0]));
+    __m128 dst4 = _mm_mul_ps(weight_data[0], _mm_set_ps1(src4[0]));
+    for (int j = 1; j < 4; ++j) {
+      dst1 = _mm_add_ps(dst1, _mm_mul_ps(weight_data[j], _mm_set_ps1(src1[j])));
+      dst2 = _mm_add_ps(dst2, _mm_mul_ps(weight_data[j], _mm_set_ps1(src2[j])));
+      dst3 = _mm_add_ps(dst3, _mm_mul_ps(weight_data[j], _mm_set_ps1(src3[j])));
+      dst4 = _mm_add_ps(dst4, _mm_mul_ps(weight_data[j], _mm_set_ps1(src4[j])));
+    }
+    src1 = _mm_loadu_ps(src);
+    src2 = _mm_loadu_ps(src + 4);
+    src3 = _mm_loadu_ps(src + 8);
+    src4 = _mm_loadu_ps(src + 12);
+    src += 16;
+    __m128 dst5 = _mm_mul_ps(weight_data[0], _mm_set_ps1(src1[0]));
+    __m128 dst6 = _mm_mul_ps(weight_data[0], _mm_set_ps1(src2[0]));
+    __m128 dst7 = _mm_mul_ps(weight_data[0], _mm_set_ps1(src3[0]));
+    __m128 dst8 = _mm_mul_ps(weight_data[0], _mm_set_ps1(src4[0]));
+    for (int j = 1; j < 4; ++j) {
+      dst5 = _mm_add_ps(dst5, _mm_mul_ps(weight_data[j], _mm_set_ps1(src1[j])));
+      dst6 = _mm_add_ps(dst6, _mm_mul_ps(weight_data[j], _mm_set_ps1(src2[j])));
+      dst7 = _mm_add_ps(dst7, _mm_mul_ps(weight_data[j], _mm_set_ps1(src3[j])));
+      dst8 = _mm_add_ps(dst8, _mm_mul_ps(weight_data[j], _mm_set_ps1(src4[j])));
+    }
+    // Remaining input blocks; weight_data[0..1] and tiles 0-3 of the next block are always preloaded.
+    if (ic4_tmp != 0) {
+      ic4_tmp -= 1;
+      src1 = _mm_loadu_ps(src);
+      src2 = _mm_loadu_ps(src + 4);
+      src3 = _mm_loadu_ps(src + 8);
+      src4 = _mm_loadu_ps(src + 12);
+      src += 16;
+      weight_data[0] = _mm_loadu_ps(weight);
+      weight_data[1] = _mm_loadu_ps(weight + 4);
+      weight += 8;
+
+      dst1 = _mm_add_ps(dst1, _mm_mul_ps(weight_data[0], _mm_set_ps1(src1[0])));
+      dst2 = _mm_add_ps(dst2, _mm_mul_ps(weight_data[0], _mm_set_ps1(src2[0])));
+      for (; ic4_tmp != 0; ic4_tmp -= 1) {
+        dst3 = _mm_add_ps(dst3, _mm_mul_ps(weight_data[0], _mm_set_ps1(src3[0])));
+        dst4 = _mm_add_ps(dst4, _mm_mul_ps(weight_data[0], _mm_set_ps1(src4[0])));
+
+        dst1 = _mm_add_ps(dst1, _mm_mul_ps(weight_data[1], _mm_set_ps1(src1[1])));
+        dst2 = _mm_add_ps(dst2, _mm_mul_ps(weight_data[1], _mm_set_ps1(src2[1])));
+        weight_data[2] = _mm_loadu_ps(weight);
+        weight_data[3] = _mm_loadu_ps(weight + 4);
+        weight += 8;
+        dst3 = _mm_add_ps(dst3, _mm_mul_ps(weight_data[1], _mm_set_ps1(src3[1])));
+        dst4 = _mm_add_ps(dst4, _mm_mul_ps(weight_data[1], _mm_set_ps1(src4[1])));
+
+        dst1 = _mm_add_ps(dst1, _mm_mul_ps(weight_data[2], _mm_set_ps1(src1[2])));
+        dst2 = _mm_add_ps(dst2, _mm_mul_ps(weight_data[2], _mm_set_ps1(src2[2])));
+        dst3 = _mm_add_ps(dst3, _mm_mul_ps(weight_data[2], _mm_set_ps1(src3[2])));
+        dst4 = _mm_add_ps(dst4, _mm_mul_ps(weight_data[2], _mm_set_ps1(src4[2])));
+
+        dst1 = _mm_add_ps(dst1, _mm_mul_ps(weight_data[3], _mm_set_ps1(src1[3])));
+        dst2 = _mm_add_ps(dst2, _mm_mul_ps(weight_data[3], _mm_set_ps1(src2[3])));
+        src1 = _mm_loadu_ps(src);
+        src2 = _mm_loadu_ps(src + 4);
+        dst3 = _mm_add_ps(dst3, _mm_mul_ps(weight_data[3], _mm_set_ps1(src3[3])));
+        dst4 = _mm_add_ps(dst4, _mm_mul_ps(weight_data[3], _mm_set_ps1(src4[3])));
+        src3 = _mm_loadu_ps(src + 8);
+        src4 = _mm_loadu_ps(src + 12);
+        src += 16;
+
+        dst5 = _mm_add_ps(dst5, _mm_mul_ps(weight_data[0], _mm_set_ps1(src1[0])));
+        dst6 = _mm_add_ps(dst6, _mm_mul_ps(weight_data[0], _mm_set_ps1(src2[0])));
+        dst7 = _mm_add_ps(dst7, _mm_mul_ps(weight_data[0], _mm_set_ps1(src3[0])));
+        dst8 = _mm_add_ps(dst8, _mm_mul_ps(weight_data[0], _mm_set_ps1(src4[0])));
+
+        dst5 = _mm_add_ps(dst5, _mm_mul_ps(weight_data[1], _mm_set_ps1(src1[1])));
+        dst6 = _mm_add_ps(dst6, _mm_mul_ps(weight_data[1], _mm_set_ps1(src2[1])));
+        dst7 = _mm_add_ps(dst7, _mm_mul_ps(weight_data[1], _mm_set_ps1(src3[1])));
+        dst8 = _mm_add_ps(dst8, _mm_mul_ps(weight_data[1], _mm_set_ps1(src4[1])));
+
+        dst5 = _mm_add_ps(dst5, _mm_mul_ps(weight_data[2], _mm_set_ps1(src1[2])));
+        dst6 = _mm_add_ps(dst6, _mm_mul_ps(weight_data[2], _mm_set_ps1(src2[2])));
+        dst7 = _mm_add_ps(dst7, _mm_mul_ps(weight_data[2], _mm_set_ps1(src3[2])));
+        weight_data[0] = _mm_loadu_ps(weight);
+        weight_data[1] = _mm_loadu_ps(weight + 4);
+        weight += 8;
+        dst8 = _mm_add_ps(dst8, _mm_mul_ps(weight_data[2], _mm_set_ps1(src4[2])));
+
+        dst5 = _mm_add_ps(dst5, _mm_mul_ps(weight_data[3], _mm_set_ps1(src1[3])));
+        dst6 = _mm_add_ps(dst6, _mm_mul_ps(weight_data[3], _mm_set_ps1(src2[3])));
+        dst7 = _mm_add_ps(dst7, _mm_mul_ps(weight_data[3], _mm_set_ps1(src3[3])));
+        src1 = _mm_loadu_ps(src);
+        src2 = _mm_loadu_ps(src + 4);
+        dst8 = _mm_add_ps(dst8, _mm_mul_ps(weight_data[3], _mm_set_ps1(src4[3])));
+        src3 = _mm_loadu_ps(src + 8);
+        src4 = _mm_loadu_ps(src + 12);
+        src += 16;
+
+        dst1 = _mm_add_ps(dst1, _mm_mul_ps(weight_data[0], _mm_set_ps1(src1[0])));
+        dst2 = _mm_add_ps(dst2, _mm_mul_ps(weight_data[0], _mm_set_ps1(src2[0])));
+      }
+      // Drain: finish the last preloaded input block.
+      dst3 = _mm_add_ps(dst3, _mm_mul_ps(weight_data[0], _mm_set_ps1(src3[0])));
+      dst4 = _mm_add_ps(dst4, _mm_mul_ps(weight_data[0], _mm_set_ps1(src4[0])));
+
+      dst1 = _mm_add_ps(dst1, _mm_mul_ps(weight_data[1], _mm_set_ps1(src1[1])));
+      dst2 = _mm_add_ps(dst2, _mm_mul_ps(weight_data[1], _mm_set_ps1(src2[1])));
+      weight_data[2] = _mm_loadu_ps(weight);
+      weight_data[3] = _mm_loadu_ps(weight + 4);
+      weight += 8;
+      dst3 = _mm_add_ps(dst3, _mm_mul_ps(weight_data[1], _mm_set_ps1(src3[1])));
+      dst4 = _mm_add_ps(dst4, _mm_mul_ps(weight_data[1], _mm_set_ps1(src4[1])));
+
+      dst1 = _mm_add_ps(dst1, _mm_mul_ps(weight_data[2], _mm_set_ps1(src1[2])));
+      dst2 = _mm_add_ps(dst2, _mm_mul_ps(weight_data[2], _mm_set_ps1(src2[2])));
+      dst3 = _mm_add_ps(dst3, _mm_mul_ps(weight_data[2], _mm_set_ps1(src3[2])));
+      dst4 = _mm_add_ps(dst4, _mm_mul_ps(weight_data[2], _mm_set_ps1(src4[2])));
+
+      dst1 = _mm_add_ps(dst1, _mm_mul_ps(weight_data[3], _mm_set_ps1(src1[3])));
+      dst2 = _mm_add_ps(dst2, _mm_mul_ps(weight_data[3], _mm_set_ps1(src2[3])));
+      dst3 = _mm_add_ps(dst3, _mm_mul_ps(weight_data[3], _mm_set_ps1(src3[3])));
+      src1 = _mm_loadu_ps(src);
+      src2 = _mm_loadu_ps(src + 4);
+      dst4 = _mm_add_ps(dst4, _mm_mul_ps(weight_data[3], _mm_set_ps1(src4[3])));
+      src3 = _mm_loadu_ps(src + 8);
+      src4 = _mm_loadu_ps(src + 12);
+      src += 16;
+      for (int j = 0; j < 4; ++j) {
+        dst5 = _mm_add_ps(dst5, _mm_mul_ps(weight_data[j], _mm_set_ps1(src1[j])));
+        dst6 = _mm_add_ps(dst6, _mm_mul_ps(weight_data[j], _mm_set_ps1(src2[j])));
+        dst7 = _mm_add_ps(dst7, _mm_mul_ps(weight_data[j], _mm_set_ps1(src3[j])));
+        dst8 = _mm_add_ps(dst8, _mm_mul_ps(weight_data[j], _mm_set_ps1(src4[j])));
+      }
+    }
+    _mm_storeu_ps(dst, dst1);
+    _mm_storeu_ps(dst + 4, dst2);
+    _mm_storeu_ps(dst + 8, dst3);
+    _mm_storeu_ps(dst + 12, dst4);
+    _mm_storeu_ps(dst + 16, dst5);
+    _mm_storeu_ps(dst + 20, dst6);
+    _mm_storeu_ps(dst + 24, dst7);
+    _mm_storeu_ps(dst + 28, dst8);
+    dst += cal_num;
+  }
+}
+#endif
diff --git a/mindspore/lite/src/ops/populate/tensorlistfromtensor_populate.cc b/mindspore/lite/src/ops/populate/tensorlistfromtensor_populate.cc
index dbc4a605e7..3c7f157d30 100644
--- a/mindspore/lite/src/ops/populate/tensorlistfromtensor_populate.cc
+++ b/mindspore/lite/src/ops/populate/tensorlistfromtensor_populate.cc
@@ -17,7 +17,7 @@
 #include "nnacl/tensorlist_parameter.h"
 #include "src/ops/primitive_c.h"
 #include "src/ops/populate/populate_register.h"
-#include "src/ops/tensorlistfromtensor.h"
+#include "src/ops/tensorlist_fromtensor.h"
 
 namespace mindspore {
 namespace lite {
diff --git a/mindspore/lite/src/ops/populate/tensorlistgetitem_populate.cc b/mindspore/lite/src/ops/populate/tensorlistgetitem_populate.cc
index 4ca542724f..18c8b3508a 100644
--- a/mindspore/lite/src/ops/populate/tensorlistgetitem_populate.cc
+++ b/mindspore/lite/src/ops/populate/tensorlistgetitem_populate.cc
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "src/ops/tensorlistgetitem.h"
+#include "src/ops/tensorlist_getitem.h"
 #include "src/ops/primitive_c.h"
 #include "src/ops/populate/populate_register.h"
 #include "nnacl/tensorlist_parameter.h"
diff --git a/mindspore/lite/src/ops/populate/tensorlistreserve_populate.cc b/mindspore/lite/src/ops/populate/tensorlistreserve_populate.cc
index 8504f40d3f..76a007cd02 100644
--- a/mindspore/lite/src/ops/populate/tensorlistreserve_populate.cc
+++ b/mindspore/lite/src/ops/populate/tensorlistreserve_populate.cc
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "src/ops/tensorlistreserve.h"
+#include "src/ops/tensorlist_reserve.h"
 #include "src/ops/primitive_c.h"
 #include "src/ops/populate/populate_register.h"
 #include "nnacl/tensorlist_parameter.h"
diff --git a/mindspore/lite/src/ops/populate/tensorlistsetlitem_populate.cc b/mindspore/lite/src/ops/populate/tensorlistsetlitem_populate.cc
index 163d0d9065..ab95a57d32 100644
--- a/mindspore/lite/src/ops/populate/tensorlistsetlitem_populate.cc
+++ b/mindspore/lite/src/ops/populate/tensorlistsetlitem_populate.cc
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "src/ops/tensorlistsetitem.h"
+#include "src/ops/tensorlist_setitem.h"
 #include "src/ops/primitive_c.h"
 #include "src/ops/populate/populate_register.h"
 #include "nnacl/tensorlist_parameter.h"
diff --git a/mindspore/lite/src/ops/populate/tensorliststack_populate.cc b/mindspore/lite/src/ops/populate/tensorliststack_populate.cc
index 88bab2207a..a06638ca24 100644
--- a/mindspore/lite/src/ops/populate/tensorliststack_populate.cc
+++ b/mindspore/lite/src/ops/populate/tensorliststack_populate.cc
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "src/ops/tensorliststack.h"
+#include "src/ops/tensorlist_stack.h"
 #include "src/ops/primitive_c.h"
 #include "src/ops/populate/populate_register.h"
 #include "nnacl/tensorlist_parameter.h"
diff --git a/mindspore/lite/src/ops/primitive_c.cc b/mindspore/lite/src/ops/primitive_c.cc
index 822ad1d5ab..3b3de99e8c 100644
--- a/mindspore/lite/src/ops/primitive_c.cc
+++ b/mindspore/lite/src/ops/primitive_c.cc
@@ -150,11 +150,11 @@
 #include "src/ops/unsorted_segment_sum.h"
 #include "src/ops/reciprocal.h"
 #include "src/ops/constant.h"
-#include "src/ops/tensorlistfromtensor.h"
-#include "src/ops/tensorlistgetitem.h"
-#include "src/ops/tensorlistsetitem.h"
-#include "src/ops/tensorlistreserve.h"
-#include "src/ops/tensorliststack.h"
+#include "src/ops/tensorlist_fromtensor.h"
+#include "src/ops/tensorlist_getitem.h"
+#include "src/ops/tensorlist_setitem.h"
+#include "src/ops/tensorlist_reserve.h"
+#include "src/ops/tensorlist_stack.h"
 #include "src/ops/merge.h"
 #include "src/ops/switch.h"
 #include "src/ops/partial.h"
diff --git a/mindspore/lite/src/ops/tensorlistfromtensor.cc b/mindspore/lite/src/ops/tensorlist_fromtensor.cc
similarity index 98%
rename from mindspore/lite/src/ops/tensorlistfromtensor.cc
rename to mindspore/lite/src/ops/tensorlist_fromtensor.cc
index 490b975cbd..8a389ce3dd 100644
--- a/mindspore/lite/src/ops/tensorlistfromtensor.cc
+++ b/mindspore/lite/src/ops/tensorlist_fromtensor.cc
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 #include <vector>
-#include "src/ops/tensorlistfromtensor.h"
+#include "src/ops/tensorlist_fromtensor.h"
 
 #ifndef PRIMITIVE_WRITEABLE
 #include "src/ops/ops_register.h"
@@ -133,7 +133,6 @@ int TensorListFromTensor::InferShape(std::vector<lite::Tensor *> inputs_, std::v
   auto ele_shape_ptr = reinterpret_cast<int *>(input1->data_c());
   auto output = reinterpret_cast<TensorList *>(outputs_[0]);
   MS_ASSERT(output != nullptr);
-  // output->set_tensors_data_type(input0->data_type());
   std::vector<std::vector<int> > tensor_shape(dim0, std::vector<int>(input0_shape.begin() + 1, input0_shape.end()));
   output->set_element_shape(std::vector<int>(ele_shape_ptr, ele_shape_ptr + input1->ElementsNum()));
   output->set_shape(std::vector<int>(1, dim0));
diff --git a/mindspore/lite/src/ops/tensorlistfromtensor.h b/mindspore/lite/src/ops/tensorlist_fromtensor.h
similarity index 100%
rename from mindspore/lite/src/ops/tensorlistfromtensor.h
rename to mindspore/lite/src/ops/tensorlist_fromtensor.h
diff --git a/mindspore/lite/src/ops/tensorlistgetitem.cc b/mindspore/lite/src/ops/tensorlist_getitem.cc
similarity index 99%
rename from mindspore/lite/src/ops/tensorlistgetitem.cc
rename to mindspore/lite/src/ops/tensorlist_getitem.cc
index 1f68c49975..065c3e8e90 100644
--- a/mindspore/lite/src/ops/tensorlistgetitem.cc
+++ b/mindspore/lite/src/ops/tensorlist_getitem.cc
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 #include <vector>
-#include "src/ops/tensorlistgetitem.h"
+#include "src/ops/tensorlist_getitem.h"
 
 #ifndef PRIMITIVE_WRITEABLE
 #include "src/ops/ops_register.h"
diff --git a/mindspore/lite/src/ops/tensorlistgetitem.h b/mindspore/lite/src/ops/tensorlist_getitem.h
similarity index 100%
rename from mindspore/lite/src/ops/tensorlistgetitem.h
rename to mindspore/lite/src/ops/tensorlist_getitem.h
diff --git a/mindspore/lite/src/ops/tensorlistreserve.cc b/mindspore/lite/src/ops/tensorlist_reserve.cc
similarity index 99%
rename from mindspore/lite/src/ops/tensorlistreserve.cc
rename to mindspore/lite/src/ops/tensorlist_reserve.cc
index 058ff0d2b4..fe7c0e66a7 100644
--- a/mindspore/lite/src/ops/tensorlistreserve.cc
+++ b/mindspore/lite/src/ops/tensorlist_reserve.cc
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 #include <vector>
-#include "src/ops/tensorlistreserve.h"
+#include "src/ops/tensorlist_reserve.h"
 
 #ifndef PRIMITIVE_WRITEABLE
 #include "src/ops/ops_register.h"
diff --git a/mindspore/lite/src/ops/tensorlistreserve.h b/mindspore/lite/src/ops/tensorlist_reserve.h
similarity index 100%
rename from mindspore/lite/src/ops/tensorlistreserve.h
rename to mindspore/lite/src/ops/tensorlist_reserve.h
diff --git a/mindspore/lite/src/ops/tensorlistsetitem.cc b/mindspore/lite/src/ops/tensorlist_setitem.cc
similarity index 99%
rename from mindspore/lite/src/ops/tensorlistsetitem.cc
rename to mindspore/lite/src/ops/tensorlist_setitem.cc
index 5626a877e2..34969c44ed 100644
--- a/mindspore/lite/src/ops/tensorlistsetitem.cc
+++ b/mindspore/lite/src/ops/tensorlist_setitem.cc
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 #include <vector>
-#include "src/ops/tensorlistsetitem.h"
+#include "src/ops/tensorlist_setitem.h"
 
 #ifndef PRIMITIVE_WRITEABLE
 #include "src/ops/ops_register.h"
diff --git a/mindspore/lite/src/ops/tensorlistsetitem.h b/mindspore/lite/src/ops/tensorlist_setitem.h
similarity index 100%
rename from mindspore/lite/src/ops/tensorlistsetitem.h
rename to mindspore/lite/src/ops/tensorlist_setitem.h
diff --git a/mindspore/lite/src/ops/tensorliststack.cc b/mindspore/lite/src/ops/tensorlist_stack.cc
similarity index 99%
rename from mindspore/lite/src/ops/tensorliststack.cc
rename to mindspore/lite/src/ops/tensorlist_stack.cc
index 00d564f886..3c162e0c2a 100644
--- a/mindspore/lite/src/ops/tensorliststack.cc
+++ b/mindspore/lite/src/ops/tensorlist_stack.cc
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 #include <vector>
-#include "src/ops/tensorliststack.h"
+#include "src/ops/tensorlist_stack.h"
 
 #ifndef PRIMITIVE_WRITEABLE
 #include "src/ops/ops_register.h"
diff --git a/mindspore/lite/src/ops/tensorliststack.h b/mindspore/lite/src/ops/tensorlist_stack.h
similarity index 100%
rename from mindspore/lite/src/ops/tensorliststack.h
rename to mindspore/lite/src/ops/tensorlist_stack.h
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/TensorListFromTensor.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/tensorlist_fromtensor_fp32.cc
similarity index 98%
rename from mindspore/lite/src/runtime/kernel/arm/fp32/TensorListFromTensor.cc
rename to mindspore/lite/src/runtime/kernel/arm/fp32/tensorlist_fromtensor_fp32.cc
index 5351e69b4f..a2ffd8f78d 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/TensorListFromTensor.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/tensorlist_fromtensor_fp32.cc
@@ -15,7 +15,7 @@
  */
 #include "include/errorcode.h"
 #include "src/kernel_registry.h"
-#include "src/runtime/kernel/arm/fp32/TensorListFromTensor.h"
+#include "src/runtime/kernel/arm/fp32/tensorlist_fromtensor_fp32.h"
 #include "src/runtime/runtime_api.h"
 
 using mindspore::kernel::KERNEL_ARCH::kCPU;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/TensorListFromTensor.h b/mindspore/lite/src/runtime/kernel/arm/fp32/tensorlist_fromtensor_fp32.h
similarity index 100%
rename from mindspore/lite/src/runtime/kernel/arm/fp32/TensorListFromTensor.h
rename to mindspore/lite/src/runtime/kernel/arm/fp32/tensorlist_fromtensor_fp32.h
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/TensorListGetItem.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/tensorlist_getitem_fp32.cc
similarity index 98%
rename from mindspore/lite/src/runtime/kernel/arm/fp32/TensorListGetItem.cc
rename to mindspore/lite/src/runtime/kernel/arm/fp32/tensorlist_getitem_fp32.cc
index 82e1225bda..affa8f7545 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/TensorListGetItem.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/tensorlist_getitem_fp32.cc
@@ -16,7 +16,7 @@
 #include "include/errorcode.h"
 #include "include/ms_tensor.h"
 #include "src/kernel_registry.h"
-#include "src/runtime/kernel/arm/fp32/TensorListGetItem.h"
+#include "src/runtime/kernel/arm/fp32/tensorlist_getitem_fp32.h"
 #include "src/runtime/runtime_api.h"
 
 using mindspore::kernel::KERNEL_ARCH::kCPU;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/TensorListGetItem.h b/mindspore/lite/src/runtime/kernel/arm/fp32/tensorlist_getitem_fp32.h
similarity index 100%
rename from mindspore/lite/src/runtime/kernel/arm/fp32/TensorListGetItem.h
rename to mindspore/lite/src/runtime/kernel/arm/fp32/tensorlist_getitem_fp32.h
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/TensorListReserve.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/tensorlist_reserve_fp32.cc
similarity index 97%
rename from mindspore/lite/src/runtime/kernel/arm/fp32/TensorListReserve.cc
rename to mindspore/lite/src/runtime/kernel/arm/fp32/tensorlist_reserve_fp32.cc
index 2c958b7650..eb5caeb1ed 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/TensorListReserve.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/tensorlist_reserve_fp32.cc
@@ -16,7 +16,7 @@
 #include <vector>
 #include "include/errorcode.h"
 #include "src/kernel_registry.h"
-#include "src/runtime/kernel/arm/fp32/TensorListReserve.h"
+#include "src/runtime/kernel/arm/fp32/tensorlist_reserve_fp32.h"
 
 using mindspore::kernel::KERNEL_ARCH::kCPU;
 using mindspore::lite::KernelRegistrar;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/TensorListReserve.h b/mindspore/lite/src/runtime/kernel/arm/fp32/tensorlist_reserve_fp32.h
similarity index 100%
rename from mindspore/lite/src/runtime/kernel/arm/fp32/TensorListReserve.h
rename to mindspore/lite/src/runtime/kernel/arm/fp32/tensorlist_reserve_fp32.h
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/TensorListSetItem.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/tensorlist_setitem_fp32.cc
similarity index 98%
rename from mindspore/lite/src/runtime/kernel/arm/fp32/TensorListSetItem.cc
rename to mindspore/lite/src/runtime/kernel/arm/fp32/tensorlist_setitem_fp32.cc
index 63a0cfadd1..fcaa460f3e 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/TensorListSetItem.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/tensorlist_setitem_fp32.cc
@@ -16,7 +16,7 @@
 #include "include/errorcode.h"
 #include "include/ms_tensor.h"
 #include "src/kernel_registry.h"
-#include "src/runtime/kernel/arm/fp32/TensorListSetItem.h"
+#include "src/runtime/kernel/arm/fp32/tensorlist_setitem_fp32.h"
 #include "src/runtime/runtime_api.h"
 
 using mindspore::kernel::KERNEL_ARCH::kCPU;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/TensorListSetItem.h b/mindspore/lite/src/runtime/kernel/arm/fp32/tensorlist_setitem_fp32.h
similarity index 100%
rename from mindspore/lite/src/runtime/kernel/arm/fp32/TensorListSetItem.h
rename to mindspore/lite/src/runtime/kernel/arm/fp32/tensorlist_setitem_fp32.h
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/TensorListStack.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/tensorlist_stack_fp32.cc
similarity index 99%
rename from mindspore/lite/src/runtime/kernel/arm/fp32/TensorListStack.cc
rename to mindspore/lite/src/runtime/kernel/arm/fp32/tensorlist_stack_fp32.cc
index 91ad3f9956..ec04883bc9 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/TensorListStack.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/tensorlist_stack_fp32.cc
@@ -19,7 +19,7 @@
 #include "include/errorcode.h"
 #include "ir/dtype/type_id.h"
 #include "src/kernel_registry.h"
-#include "src/runtime/kernel/arm/fp32/TensorListStack.h"
+#include "src/runtime/kernel/arm/fp32/tensorlist_stack_fp32.h"
 
 using mindspore::kernel::KERNEL_ARCH::kCPU;
 using mindspore::lite::KernelRegistrar;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/TensorListStack.h b/mindspore/lite/src/runtime/kernel/arm/fp32/tensorlist_stack_fp32.h
similarity index 100%
rename from mindspore/lite/src/runtime/kernel/arm/fp32/TensorListStack.h
rename to mindspore/lite/src/runtime/kernel/arm/fp32/tensorlist_stack_fp32.h