diff --git a/mindspore/lite/nnacl/fp32/add_fp32.c b/mindspore/lite/nnacl/fp32/add_fp32.c
new file mode 100644
index 0000000000..11b839766a
--- /dev/null
+++ b/mindspore/lite/nnacl/fp32/add_fp32.c
@@ -0,0 +1,225 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "nnacl/fp32/add_fp32.h"
+#include "nnacl/fp32/arithmetic_fp32.h"
+
+int ElementOptAdd(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
+#ifdef ENABLE_NEON
+  float32x4_t vin0_opt = vdupq_n_f32(in0[0]);
+  float32x4_t vin1_opt = vdupq_n_f32(in1[0]);
+#endif
+  int index = 0;
+  if (param->in_elements_num0_ == 1) {
+#ifdef ENABLE_NEON
+    for (; index <= size - 4; index += C4NUM) {
+      float32x4_t vin1 = vld1q_f32(in1 + index);
+      float32x4_t vout = vaddq_f32(vin0_opt, vin1);
+      vst1q_f32(out + index, vout);
+    }
+#endif
+    for (; index < size; index++) {
+      out[index] = in0[0] + in1[index];
+    }
+  } else {
+#ifdef ENABLE_NEON
+    for (; index <= size - 4; index += C4NUM) {
+      float32x4_t vin0 = vld1q_f32(in0 + index);
+      float32x4_t vout = vaddq_f32(vin0, vin1_opt);
+      vst1q_f32(out + index, vout);
+    }
+#endif
+    for (; index < size; index++) {
+      out[index] = in0[index] + in1[0];
+    }
+  }
+  return NNACL_OK;
+}
+
+int ElementOptAddInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) {
+#ifdef ENABLE_NEON
+  int32x4_t vin0_opt = vdupq_n_s32(in0[0]);
+  int32x4_t vin1_opt = vdupq_n_s32(in1[0]);
+#endif
+  int index = 0;
+  if (param->in_elements_num0_ == 1) {
+#ifdef ENABLE_NEON
+    for (; index <= size - 4; index += C4NUM) {
+      int32x4_t vin1 = vld1q_s32(in1 + index);
+      int32x4_t vout = vaddq_s32(vin0_opt, vin1);
+      vst1q_s32(out + index, vout);
+    }
+#endif
+    for (; index < size; index++) {
+      out[index] = in0[0] + in1[index];
+    }
+  } else {
+#ifdef ENABLE_NEON
+    for (; index <= size - 4; index += C4NUM) {
+      int32x4_t vin0 = vld1q_s32(in0 + index);
+      int32x4_t vout = vaddq_s32(vin0, vin1_opt);
+      vst1q_s32(out + index, vout);
+    }
+#endif
+    for (; index < size; index++) {
+      out[index] = in0[index] + in1[0];
+    }
+  }
+  return NNACL_OK;
+}
+
+int ElementOptAddRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
+#ifdef ENABLE_NEON
+  float32x4_t vin0_opt = vdupq_n_f32(in0[0]);
+  float32x4_t vin1_opt = vdupq_n_f32(in1[0]);
+  float32x4_t zeros = vdupq_n_f32(0.0f);
+#endif
+  int index = 0;
+  if (param->in_elements_num0_ == 1) {
+#ifdef ENABLE_NEON
+    for (; index <= size - 4; index += C4NUM) {
+      float32x4_t vin1 = vld1q_f32(in1 + index);
+      float32x4_t vout = vmaxq_f32(vaddq_f32(vin0_opt, vin1), zeros);
+      vst1q_f32(out + index, vout);
+    }
+#endif
+    for (; index < size; index++) {
+      out[index] = MSMAX(in0[0] + in1[index], 0);
+    }
+  } else {
+#ifdef ENABLE_NEON
+    for (; index <= size - 4; index += C4NUM) {
+      float32x4_t vin0 = vld1q_f32(in0 + index);
+      float32x4_t vout = vmaxq_f32(vaddq_f32(vin0, vin1_opt), zeros);
+      vst1q_f32(out + index, vout);
+    }
+#endif
+    for (; index < size; index++) {
+      out[index] = MSMAX(in0[index] + in1[0], 0);
+    }
+  }
+  return NNACL_OK;
+}
+
+int ElementOptAddRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
+#ifdef ENABLE_NEON
+  float32x4_t vin0_opt = vdupq_n_f32(in0[0]);
+  float32x4_t vin1_opt = vdupq_n_f32(in1[0]);
+  float32x4_t zeros = vdupq_n_f32(0.0f);
+  float32x4_t bounds = vdupq_n_f32(6.0f);
+#endif
+  int index = 0;
+  if (param->in_elements_num0_ == 1) {
+#ifdef ENABLE_NEON
+    for (; index <= size - 4; index += C4NUM) {
+      float32x4_t vin1 = vld1q_f32(in1 + index);
+      float32x4_t vout = vminq_f32(vmaxq_f32(vaddq_f32(vin0_opt, vin1), zeros), bounds);
+      vst1q_f32(out + index, vout);
+    }
+#endif
+    for (; index < size; index++) {
+      out[index] = MSMIN(MSMAX(in0[0] + in1[index], 0), 6);
+    }
+  } else {
+#ifdef ENABLE_NEON
+    for (; index <= size - 4; index += C4NUM) {
+      float32x4_t vin0 = vld1q_f32(in0 + index);
+      float32x4_t vout = vminq_f32(vmaxq_f32(vaddq_f32(vin0, vin1_opt), zeros), bounds);
+      vst1q_f32(out + index, vout);
+    }
+#endif
+    for (; index < size; index++) {
+      out[index] = MSMIN(MSMAX(in0[index] + in1[0], 0), 6);
+    }
+  }
+
+  return NNACL_OK;
+}
+
+int BroadcastAdd(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size,
+                 ArithmeticParameter *param) {
+  TileDimensionsFp32(in0, in1, tile_in0, tile_in1, param);
+  return ElementAdd(tile_in0, tile_in1, out, size);
+}
+
+int ElementAdd(const float *in0, const float *in1, float *out, int size) {
+  int index = 0;
+#ifdef ENABLE_NEON
+  for (; index <= size - 4; index += C4NUM) {
+    float32x4_t vin0 = vld1q_f32(in0 + index);
+    float32x4_t vin1 = vld1q_f32(in1 + index);
+    float32x4_t vout = vaddq_f32(vin0, vin1);
+    vst1q_f32(out + index, vout);
+  }
+#endif
+  for (; index < size; index++) {
+    out[index] = in0[index] + in1[index];
+  }
+  return NNACL_OK;
+}
+
+int ElementAddRelu(const float *in0, const float *in1, float *out, int size) {
+  int index = 0;
+#ifdef ENABLE_NEON
+  float32x4_t zeros = vdupq_n_f32(0.0f);
+  for (; index <= size - 4; index += C4NUM) {
+    float32x4_t vin0 = vld1q_f32(in0 + index);
+    float32x4_t vin1 = vld1q_f32(in1 + index);
+    float32x4_t vout = vaddq_f32(vin0, vin1);
+    vout = vbslq_f32(vcgtq_f32(vout, zeros), vout, zeros);
+    vst1q_f32(out + index, vout);
+  }
+#endif
+  for (; index < size; index++) {
+    float res = in0[index] + in1[index];
+    out[index] = res > 0 ? res : 0;
+  }
+  return NNACL_OK;
+}
+
+int ElementAddRelu6(const float *in0, const float *in1, float *out, int size) {
+  int index = 0;
+#ifdef ENABLE_NEON
+  float32x4_t zeros = vdupq_n_f32(0.0f);
+  float32x4_t bounds = vdupq_n_f32(6.0f);
+  for (; index <= size - 4; index += C4NUM) {
+    float32x4_t vin0 = vld1q_f32(in0 + index);
+    float32x4_t vin1 = vld1q_f32(in1 + index);
+    float32x4_t vout = vminq_f32(vmaxq_f32(vaddq_f32(vin0, vin1), zeros), bounds);
+    vst1q_f32(out + index, vout);
+  }
+#endif
+  for (; index < size; index++) {
+    out[index] = MSMIN(MSMAX(in0[index] + in1[index], 0), 6);
+  }
+  return NNACL_OK;
+}
+
+int ElementAddInt(const int *in0, const int *in1, int *out, int size) {
+  int index = 0;
+#ifdef ENABLE_NEON
+  for (; index <= size - 4; index += C4NUM) {
+    int32x4_t vin0 = vld1q_s32(in0 + index);
+    int32x4_t vin1 = vld1q_s32(in1 + index);
+    int32x4_t vout = vaddq_s32(vin0, vin1);
+    vst1q_s32(out + index, vout);
+  }
+#endif
+  for (; index < size; index++) {
+    out[index] = in0[index] + in1[index];
+  }
+  return NNACL_OK;
+}
diff --git a/mindspore/lite/nnacl/fp32/add_fp32.h b/mindspore/lite/nnacl/fp32/add_fp32.h
new file mode 100644
index 0000000000..4344f33175
--- /dev/null
+++ b/mindspore/lite/nnacl/fp32/add_fp32.h
@@ -0,0 +1,45 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_LITE_NNACL_FP32_ADD_H_
+#define MINDSPORE_LITE_NNACL_FP32_ADD_H_
+
+#ifdef ENABLE_NEON
+#include <arm_neon.h>
+#endif
+#include "nnacl/op_base.h"
+#include "nnacl/base/arithmetic_base.h"
+#include "nnacl/errorcode.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int ElementAdd(const float *in0, const float *in1, float *out, int size);
+int ElementAddRelu(const float *in0, const float *in1, float *out, int size);
+int ElementAddRelu6(const float *in0, const float *in1, float *out, int size);
+int ElementAddInt(const int *in0, const int *in1, int *out, int size);
+int ElementOptAdd(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
+int ElementOptAddInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param);
+int ElementOptAddRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
+int ElementOptAddRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
+int BroadcastAdd(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size,
+                 ArithmeticParameter *param);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // MINDSPORE_LITE_NNACL_FP32_ADD_H_
diff --git a/mindspore/lite/nnacl/fp32/arithmetic_fp32.c b/mindspore/lite/nnacl/fp32/arithmetic_fp32.c
index 73583bd4bc..cda6db6bce 100644
--- a/mindspore/lite/nnacl/fp32/arithmetic_fp32.c
+++ b/mindspore/lite/nnacl/fp32/arithmetic_fp32.c
@@ -19,812 +19,6 @@
 
 #define ACCURACY_DATA 0.00000001
 
-int ElementOptMul(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
-#ifdef ENABLE_NEON
-  float32x4_t vin0_opt = vdupq_n_f32(in0[0]);
-  float32x4_t vin1_opt = vdupq_n_f32(in1[0]);
-#endif
-  int index = 0;
-  if (param->in_elements_num0_ == 1) {
-#ifdef ENABLE_NEON
-    for (; index <= size - 4; index += C4NUM) {
-      float32x4_t vin1 = vld1q_f32(in1 + index);
-      float32x4_t vout = vmulq_f32(vin0_opt, vin1);
-      vst1q_f32(out + index, vout);
-    }
-#endif
-    for (; index < size; index++) {
-      out[index] = in0[0] * in1[index];
-    }
-  } else {
-#ifdef ENABLE_NEON
-    for (; index <= size - 4; index += C4NUM) {
-      float32x4_t vin0 = vld1q_f32(in0 + index);
-      float32x4_t vout = vmulq_f32(vin0, vin1_opt);
-      vst1q_f32(out + index, vout);
-    }
-#endif
-    for (; index < size; index++) {
-      out[index] = in0[index] * in1[0];
-    }
-  }
-  return NNACL_OK;
-}
-
-int ElementOptMulRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
-#ifdef ENABLE_NEON
-  float32x4_t vin0_opt = vdupq_n_f32(in0[0]);
-  float32x4_t vin1_opt = vdupq_n_f32(in1[0]);
-  float32x4_t zeros = vdupq_n_f32(0.0f);
-#endif
-  int index = 0;
-  if (param->in_elements_num0_ == 1) {
-#ifdef ENABLE_NEON
-    for (; index <= size - 4; index += C4NUM) {
-      float32x4_t vin1 = vld1q_f32(in1 + index);
-      float32x4_t vout = vmaxq_f32(vmulq_f32(vin0_opt, vin1), zeros);
-      vst1q_f32(out + index, vout);
-    }
-#endif
-    for (; index < size; index++) {
-      out[index] = MSMAX(in0[0] * in1[index], 0);
-    }
-  } else {
-#ifdef ENABLE_NEON
-    for (; index <= size - 4; index += C4NUM) {
-      float32x4_t vin0 = vld1q_f32(in0 + index);
-      float32x4_t vout = vmaxq_f32(vmulq_f32(vin0, vin1_opt), zeros);
-      vst1q_f32(out + index, vout);
-    }
-#endif
-    for (; index < size; index++) {
-      out[index] = MSMAX(in0[index] * in1[0], 0);
-    }
-  }
-  return NNACL_OK;
-}
-
-int ElementOptMulRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
-#ifdef ENABLE_NEON
-  float32x4_t vin0_opt = vdupq_n_f32(in0[0]);
-  float32x4_t vin1_opt = vdupq_n_f32(in1[0]);
-  float32x4_t zeros = vdupq_n_f32(0.0f);
-  float32x4_t bounds = vdupq_n_f32(6.0f);
-#endif
-  int index = 0;
-  if (param->in_elements_num0_ == 1) {
-#ifdef ENABLE_NEON
-    for (; index <= size - 4; index += C4NUM) {
-      float32x4_t vin1 = vld1q_f32(in1 + index);
-      float32x4_t vout = vminq_f32(vmaxq_f32(vmulq_f32(vin0_opt, vin1), zeros), bounds);
-      vst1q_f32(out + index, vout);
-    }
-#endif
-    for (; index < size; index++) {
-      out[index] = MSMIN(MSMAX(in0[0] * in1[index], 0), 6);
-    }
-  } else {
-#ifdef ENABLE_NEON
-    for (; index <= size - 4; index += C4NUM) {
-      float32x4_t vin0 = vld1q_f32(in0 + index);
-      float32x4_t vout = vminq_f32(vmaxq_f32(vmulq_f32(vin0, vin1_opt), zeros), bounds);
-      vst1q_f32(out + index, vout);
-    }
-#endif
-    for (; index < size; index++) {
-      out[index] = MSMIN(MSMAX(in0[index] * in1[0], 0), 6);
-    }
-  }
-  return NNACL_OK;
-}
-
-int ElementOptMulInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) {
-#ifdef ENABLE_NEON
-  int32x4_t vin0_opt = vdupq_n_s32(in0[0]);
-  int32x4_t vin1_opt = vdupq_n_s32(in1[0]);
-#endif
-  int index = 0;
-  if (param->in_elements_num0_ == 1) {
-#ifdef ENABLE_NEON
-    for (; index <= size - 4; index += C4NUM) {
-      int32x4_t vin1 = vld1q_s32(in1 + index);
-      int32x4_t vout = vmulq_s32(vin0_opt, vin1);
-      vst1q_s32(out + index, vout);
-    }
-#endif
-    for (; index < size; index++) {
-      out[index] = in0[0] * in1[index];
-    }
-  } else {
-#ifdef ENABLE_NEON
-    for (; index <= size - 4; index += C4NUM) {
-      int32x4_t vin0 = vld1q_s32(in0 + index);
-      int32x4_t vout = vmulq_s32(vin0, vin1_opt);
-      vst1q_s32(out + index, vout);
-    }
-#endif
-    for (; index < size; index++) {
-      out[index] = in0[index] * in1[0];
-    }
-  }
-  return NNACL_OK;
-}
-
-int ElementOptMulReluInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) {
-#ifdef ENABLE_NEON
-  int32x4_t vin0_opt = vdupq_n_s32(in0[0]);
-  int32x4_t vin1_opt = vdupq_n_s32(in1[0]);
-  int32x4_t zeros = vdupq_n_s32(0);
-#endif
-  int index = 0;
-  if (param->in_elements_num0_ == 1) {
-#ifdef ENABLE_NEON
-    for (; index <= size - 4; index += C4NUM) {
-      int32x4_t vin1 = vld1q_s32(in1 + index);
-      int32x4_t vout = vmaxq_s32(vmulq_s32(vin0_opt, vin1), zeros);
-      vst1q_s32(out + index, vout);
-    }
-#endif
-    for (; index < size; index++) {
-      out[index] = MSMAX(in0[0] * in1[index], 0);
-    }
-  } else {
-#ifdef ENABLE_NEON
-    for (; index <= size - 4; index += C4NUM) {
-      int32x4_t vin0 = vld1q_s32(in0 + index);
-      int32x4_t vout = vmaxq_s32(vmulq_s32(vin0, vin1_opt), zeros);
-      vst1q_s32(out + index, vout);
-    }
-#endif
-    for (; index < size; index++) {
-      out[index] = MSMAX(in0[index] * in1[0], 0);
-    }
-  }
-  return NNACL_OK;
-}
-
-int ElementOptMulRelu6Int(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) {
-#ifdef ENABLE_NEON
-  int32x4_t vin0_opt = vdupq_n_s32(in0[0]);
-  int32x4_t vin1_opt = vdupq_n_s32(in1[0]);
-  int32x4_t zeros = vdupq_n_s32(0);
-  int32x4_t bounds = vdupq_n_s32(6);
-#endif
-  int index = 0;
-  if (param->in_elements_num0_ == 1) {
-#ifdef ENABLE_NEON
-    for (; index <= size - 4; index += C4NUM) {
-      int32x4_t vin1 = vld1q_s32(in1 + index);
-      int32x4_t vout = vminq_s32(vmaxq_s32(vmulq_s32(vin0_opt, vin1), zeros), bounds);
-      vst1q_s32(out + index, vout);
-    }
-#endif
-    for (; index < size; index++) {
-      out[index] = MSMIN(MSMAX(in0[0] * in1[index], 0), 6);
-    }
-  } else {
-#ifdef ENABLE_NEON
-    for (; index <= size - 4; index += C4NUM) {
-      int32x4_t vin0 = vld1q_s32(in0 + index);
-      int32x4_t vout = vminq_s32(vmaxq_s32(vmulq_s32(vin0, vin1_opt), zeros), bounds);
-      vst1q_s32(out + index, vout);
-    }
-#endif
-    for (; index < size; index++) {
-      out[index] = MSMIN(MSMAX(in0[index] * in1[0], 0), 6);
-    }
-  }
-  return NNACL_OK;
-}
-
-int ElementOptSub(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
-#ifdef ENABLE_NEON
-  float32x4_t vin0_opt = vdupq_n_f32(in0[0]);
-  float32x4_t vin1_opt = vdupq_n_f32(in1[0]);
-#endif
-  int index = 0;
-  if (param->in_elements_num0_ == 1) {
-#ifdef ENABLE_NEON
-    for (; index <= size - 4; index += C4NUM) {
-      float32x4_t vin1 = vld1q_f32(in1 + index);
-      float32x4_t vout = vsubq_f32(vin0_opt, vin1);
-      vst1q_f32(out + index, vout);
-    }
-#endif
-    for (; index < size; index++) {
-      out[index] = in0[0] - in1[index];
-    }
-  } else {
-#ifdef ENABLE_NEON
-    for (; index <= size - 4; index += C4NUM) {
-      float32x4_t vin0 = vld1q_f32(in0 + index);
-      float32x4_t vout = vsubq_f32(vin0, vin1_opt);
-      vst1q_f32(out + index, vout);
-    }
-#endif
-    for (; index < size; index++) {
-      out[index] = in0[index] - in1[0];
-    }
-  }
-  return NNACL_OK;
-}
-
-int ElementOptSubInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) {
-#ifdef ENABLE_NEON
-  int32x4_t vin0_opt = vdupq_n_s32(in0[0]);
-  int32x4_t vin1_opt = vdupq_n_s32(in1[0]);
-#endif
-  int index = 0;
-  if (param->in_elements_num0_ == 1) {
-#ifdef ENABLE_NEON
-    for (; index <= size - 4; index += C4NUM) {
-      int32x4_t vin1 = vld1q_s32(in1 + index);
-      int32x4_t vout = vsubq_s32(vin0_opt, vin1);
-      vst1q_s32(out + index, vout);
-    }
-#endif
-    for (; index < size; index++) {
-      out[index] = in0[0] - in1[index];
-    }
-  } else {
-#ifdef ENABLE_NEON
-    for (; index <= size - 4; index += C4NUM) {
-      int32x4_t vin0 = vld1q_s32(in0 + index);
-      int32x4_t vout = vsubq_s32(vin0, vin1_opt);
-      vst1q_s32(out + index, vout);
-    }
-#endif
-    for (; index < size; index++) {
-      out[index] = in0[index] - in1[0];
-    }
-  }
-  return NNACL_OK;
-}
-
-int ElementOptSubRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
-#ifdef ENABLE_NEON
-  float32x4_t vin0_opt = vdupq_n_f32(in0[0]);
-  float32x4_t vin1_opt = vdupq_n_f32(in1[0]);
-  float32x4_t zeros = vdupq_n_f32(0.0f);
-#endif
-  int index = 0;
-  if (param->in_elements_num0_ == 1) {
-#ifdef ENABLE_NEON
-    for (; index <= size - 4; index += C4NUM) {
-      float32x4_t vin1 = vld1q_f32(in1 + index);
-      float32x4_t vout = vmaxq_f32(vsubq_f32(vin0_opt, vin1), zeros);
-      vst1q_f32(out + index, vout);
-    }
-#endif
-    for (; index < size; index++) {
-      out[index] = MSMAX(in0[0] - in1[index], 0);
-    }
-  } else {
-#ifdef ENABLE_NEON
-    for (; index <= size - 4; index += C4NUM) {
-      float32x4_t vin0 = vld1q_f32(in0 + index);
-      float32x4_t vout = vmaxq_f32(vsubq_f32(vin0, vin1_opt), zeros);
-      vst1q_f32(out + index, vout);
-    }
-#endif
-    for (; index < size; index++) {
-      out[index] = MSMAX(in0[index] - in1[0], 0);
-    }
-  }
-  return NNACL_OK;
-}
-
-int ElementOptSubRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
-#ifdef ENABLE_NEON
-  float32x4_t vin0_opt = vdupq_n_f32(in0[0]);
-  float32x4_t vin1_opt = vdupq_n_f32(in1[0]);
-  float32x4_t zeros = vdupq_n_f32(0.0f);
-  float32x4_t bounds = vdupq_n_f32(6.0f);
-#endif
-  int index = 0;
-  if (param->in_elements_num0_ == 1) {
-#ifdef ENABLE_NEON
-    for (; index <= size - 4; index += C4NUM) {
-      float32x4_t vin1 = vld1q_f32(in1 + index);
-      float32x4_t vout = vminq_f32(vmaxq_f32(vsubq_f32(vin0_opt, vin1), zeros), bounds);
-      vst1q_f32(out + index, vout);
-    }
-#endif
-    for (; index < size; index++) {
-      out[index] = MSMIN(MSMAX(in0[0] - in1[index], 0), 6);
-    }
-  } else {
-#ifdef ENABLE_NEON
-    for (; index <= size - 4; index += C4NUM) {
-      float32x4_t vin0 = vld1q_f32(in0 + index);
-      float32x4_t vout = vminq_f32(vmaxq_f32(vsubq_f32(vin0, vin1_opt), zeros), bounds);
-      vst1q_f32(out + index, vout);
-    }
-#endif
-    for (; index < size; index++) {
-      out[index] = MSMIN(MSMAX(in0[index] - in1[0], 0), 6);
-    }
-  }
-  return NNACL_OK;
-}
-
-int ElementOptAdd(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
-#ifdef ENABLE_NEON
-  float32x4_t vin0_opt = vdupq_n_f32(in0[0]);
-  float32x4_t vin1_opt = vdupq_n_f32(in1[0]);
-#endif
-  int index = 0;
-  if (param->in_elements_num0_ == 1) {
-#ifdef ENABLE_NEON
-    for (; index <= size - 4; index += C4NUM) {
-      float32x4_t vin1 = vld1q_f32(in1 + index);
-      float32x4_t vout = vaddq_f32(vin0_opt, vin1);
-      vst1q_f32(out + index, vout);
-    }
-#endif
-    for (; index < size; index++) {
-      out[index] = in0[0] + in1[index];
-    }
-  } else {
-#ifdef ENABLE_NEON
-    for (; index <= size - 4; index += C4NUM) {
-      float32x4_t vin0 = vld1q_f32(in0 + index);
-      float32x4_t vout = vaddq_f32(vin0, vin1_opt);
-      vst1q_f32(out + index, vout);
-    }
-#endif
-    for (; index < size; index++) {
-      out[index] = in0[index] + in1[0];
-    }
-  }
-  return NNACL_OK;
-}
-
-int ElementOptAddInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) {
-#ifdef ENABLE_NEON
-  int32x4_t vin0_opt = vdupq_n_s32(in0[0]);
-  int32x4_t vin1_opt = vdupq_n_s32(in1[0]);
-#endif
-  int index = 0;
-  if (param->in_elements_num0_ == 1) {
-#ifdef ENABLE_NEON
-    for (; index <= size - 4; index += C4NUM) {
-      int32x4_t vin1 = vld1q_s32(in1 + index);
-      int32x4_t vout = vaddq_s32(vin0_opt, vin1);
-      vst1q_s32(out + index, vout);
-    }
-#endif
-    for (; index < size; index++) {
-      out[index] = in0[0] + in1[index];
-    }
-  } else {
-#ifdef ENABLE_NEON
-    for (; index <= size - 4; index += C4NUM) {
-      int32x4_t vin0 = vld1q_s32(in0 + index);
-      int32x4_t vout = vaddq_s32(vin0, vin1_opt);
-      vst1q_s32(out + index, vout);
-    }
-#endif
-    for (; index < size; index++) {
-      out[index] = in0[index] + in1[0];
-    }
-  }
-  return NNACL_OK;
-}
-
-int ElementOptAddRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
-#ifdef ENABLE_NEON
-  float32x4_t vin0_opt = vdupq_n_f32(in0[0]);
-  float32x4_t vin1_opt = vdupq_n_f32(in1[0]);
-  float32x4_t zeros = vdupq_n_f32(0.0f);
-#endif
-  int index = 0;
-  if (param->in_elements_num0_ == 1) {
-#ifdef ENABLE_NEON
-    for (; index <= size - 4; index += C4NUM) {
-      float32x4_t vin1 = vld1q_f32(in1 + index);
-      float32x4_t vout = vmaxq_f32(vaddq_f32(vin0_opt, vin1), zeros);
-      vst1q_f32(out + index, vout);
-    }
-#endif
-    for (; index < size; index++) {
-      out[index] = MSMAX(in0[0] + in1[index], 0);
-    }
-  } else {
-#ifdef ENABLE_NEON
-    for (; index <= size - 4; index += C4NUM) {
-      float32x4_t vin0 = vld1q_f32(in0 + index);
-      float32x4_t vout = vmaxq_f32(vaddq_f32(vin0, vin1_opt), zeros);
-      vst1q_f32(out + index, vout);
-    }
-#endif
-    for (; index < size; index++) {
-      out[index] = MSMAX(in0[index] + in1[0], 0);
-    }
-  }
-  return NNACL_OK;
-}
-
-int ElementOptAddRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
-#ifdef ENABLE_NEON
-  float32x4_t vin0_opt = vdupq_n_f32(in0[0]);
-  float32x4_t vin1_opt = vdupq_n_f32(in1[0]);
-  float32x4_t zeros = vdupq_n_f32(0.0f);
-  float32x4_t bounds = vdupq_n_f32(6.0f);
-#endif
-  int index = 0;
-  if (param->in_elements_num0_ == 1) {
-#ifdef ENABLE_NEON
-    for (; index <= size - 4; index += C4NUM) {
-      float32x4_t vin1 = vld1q_f32(in1 + index);
-      float32x4_t vout = vminq_f32(vmaxq_f32(vaddq_f32(vin0_opt, vin1), zeros), bounds);
-      vst1q_f32(out + index, vout);
-    }
-#endif
-    for (; index < size; index++) {
-      out[index] = MSMIN(MSMAX(in0[0] + in1[index], 0), 6);
-    }
-  } else {
-#ifdef ENABLE_NEON
-    for (; index <= size - 4; index += C4NUM) {
-      float32x4_t vin0 = vld1q_f32(in0 + index);
-      float32x4_t vout = vminq_f32(vmaxq_f32(vaddq_f32(vin0, vin1_opt), zeros), bounds);
-      vst1q_f32(out + index, vout);
-    }
-#endif
-    for (; index < size; index++) {
-      out[index] = MSMIN(MSMAX(in0[index] + in1[0], 0), 6);
-    }
-  }
-
-  return NNACL_OK;
-}
-
-int ElementOptDiv(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
-  if (param->in_elements_num0_ == 1) {
-    for (int index = 0; index < size; index++) {
-      out[index] = in0[0] / in1[index];
-    }
-  } else {
-    if (in1[0] == 0) {
-      return NNACL_ERRCODE_DIVISOR_ZERO;
-    }
-    for (int index = 0; index < size; index++) {
-      out[index] = in0[index] / in1[0];
-    }
-  }
-  return NNACL_OK;
-}
-
-int ElementOptDivRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
-  if (param->in_elements_num0_ == 1) {
-    for (int index = 0; index < size; index++) {
-      out[index] = in0[0] / in1[index];
-      out[index] = out[index] > 0 ? out[index] : 0;
-    }
-  } else {
-    for (int index = 0; index < size; index++) {
-      out[index] = in0[index] / in1[0];
-      out[index] = out[index] > 0 ? out[index] : 0;
-    }
-  }
-  return NNACL_OK;
-}
-
-int ElementOptDivRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
-  if (param->in_elements_num0_ == 1) {
-    for (int index = 0; index < size; index++) {
-      out[index] = MSMIN(MSMAX(in0[0] / in1[index], 0), 6);
-    }
-  } else {
-    for (int index = 0; index < size; index++) {
-      out[index] = MSMIN(MSMAX(in0[index] / in1[0], 0), 6);
-    }
-  }
-  return NNACL_OK;
-}
-
-int ElementOptDivInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) {
-  if (param->in_elements_num0_ == 1) {
-    for (int index = 0; index < size; index++) {
-      out[index] = in0[0] / in1[index];
-    }
-  } else {
-    if (in1[0] == 0) {
-      return NNACL_ERRCODE_DIVISOR_ZERO;
-    }
-    for (int index = 0; index < size; index++) {
-      out[index] = in0[index] / in1[0];
-    }
-  }
-  return NNACL_OK;
-}
-
-int BroadcastAdd(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size,
-                 ArithmeticParameter *param) {
-  TileDimensionsFp32(in0, in1, tile_in0, tile_in1, param);
-  return ElementAdd(tile_in0, tile_in1, out, size);
-}
-
-int BroadcastMul(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size,
-                 ArithmeticParameter *param) {
-  TileDimensionsFp32(in0, in1, tile_in0, tile_in1, param);
-  return ElementMul(tile_in0, tile_in1, out, size);
-}
-
-int ElementMul(const float *in0, const float *in1, float *out, int size) {
-  int index = 0;
-#ifdef ENABLE_NEON
-  for (; index <= size - 4; index += C4NUM) {
-    float32x4_t vin0 = vld1q_f32(in0 + index);
-    float32x4_t vin1 = vld1q_f32(in1 + index);
-    float32x4_t vout = vmulq_f32(vin0, vin1);
-    vst1q_f32(out + index, vout);
-  }
-#endif
-  for (; index < size; index++) {
-    out[index] = in0[index] * in1[index];
-  }
-  return NNACL_OK;
-}
-
-int ElementMulRelu(const float *in0, const float *in1, float *out, int size) {
-  int index = 0;
-#ifdef ENABLE_NEON
-  float32x4_t zeros = vdupq_n_f32(0.0f);
-  for (; index <= size - 4; index += C4NUM) {
-    float32x4_t vin0 = vld1q_f32(in0 + index);
-    float32x4_t vin1 = vld1q_f32(in1 + index);
-    float32x4_t vout = vmulq_f32(vin0, vin1);
-    vout = vbslq_f32(vcgtq_f32(vout, zeros), vout, zeros);
-    vst1q_f32(out + index, vout);
-  }
-#endif
-  for (; index < size; index++) {
-    float res = in0[index] * in1[index];
-    out[index] = res > 0 ? res : 0;
-  }
-  return NNACL_OK;
-}
-
-int ElementMulRelu6(const float *in0, const float *in1, float *out, int size) {
-  int index = 0;
-#ifdef ENABLE_NEON
-  float32x4_t zeros = vdupq_n_f32(0.0f);
-  float32x4_t bounds = vdupq_n_f32(6.0f);
-  for (; index <= size - 4; index += C4NUM) {
-    float32x4_t vin0 = vld1q_f32(in0 + index);
-    float32x4_t vin1 = vld1q_f32(in1 + index);
-    float32x4_t vout = vminq_f32(vmaxq_f32(vmulq_f32(vin0, vin1), zeros), bounds);
-    vst1q_f32(out + index, vout);
-  }
-#endif
-  for (; index < size; index++) {
-    out[index] = MSMIN(MSMAX(in0[index] * in1[index], 0), 6);
-  }
-  return NNACL_OK;
-}
-
-int ElementMulInt(const int *in0, const int *in1, int *out, int size) {
-  int index = 0;
-#ifdef ENABLE_NEON
-  for (; index <= size - 4; index += C4NUM) {
-    int32x4_t vin0 = vld1q_s32(in0 + index);
-    int32x4_t vin1 = vld1q_s32(in1 + index);
-    int32x4_t vout = vmulq_s32(vin0, vin1);
-    vst1q_s32(out + index, vout);
-  }
-#endif
-  for (; index < size; index++) {
-    out[index] = in0[index] * in1[index];
-  }
-  return NNACL_OK;
-}
-
-int ElementMulReluInt(const int *in0, const int *in1, int *out, int size) {
-  int index = 0;
-#ifdef ENABLE_NEON
-  int32x4_t zeros = vdupq_n_s32(0);
-  for (; index <= size - 4; index += C4NUM) {
-    int32x4_t vin0 = vld1q_s32(in0 + index);
-    int32x4_t vin1 = vld1q_s32(in1 + index);
-    int32x4_t vout = vmulq_s32(vin0, vin1);
-    vout = vbslq_s32(vcgtq_s32(vout, zeros), vout, zeros);
-    vst1q_s32(out + index, vout);
-  }
-#endif
-  for (; index < size; index++) {
-    float res = in0[index] * in1[index];
-    out[index] = res > 0 ? res : 0;
-  }
-  return NNACL_OK;
-}
-
-int ElementMulRelu6Int(const int *in0, const int *in1, int *out, int size) {
-  int index = 0;
-#ifdef ENABLE_NEON
-  int32x4_t zeros = vdupq_n_s32(0);
-  int32x4_t bounds = vdupq_n_s32(6);
-  for (; index <= size - 4; index += C4NUM) {
-    int32x4_t vin0 = vld1q_s32(in0 + index);
-    int32x4_t vin1 = vld1q_s32(in1 + index);
-    int32x4_t vout = vminq_s32(vmaxq_s32(vmulq_s32(vin0, vin1), zeros), bounds);
-    vst1q_s32(out + index, vout);
-  }
-#endif
-  for (; index < size; index++) {
-    out[index] = MSMIN(MSMAX(in0[index] * in1[index], 0), 6);
-  }
-  return NNACL_OK;
-}
-
-int ElementAdd(const float *in0, const float *in1, float *out, int size) {
-  int index = 0;
-#ifdef ENABLE_NEON
-  for (; index <= size - 4; index += C4NUM) {
-    float32x4_t vin0 = vld1q_f32(in0 + index);
-    float32x4_t vin1 = vld1q_f32(in1 + index);
-    float32x4_t vout = vaddq_f32(vin0, vin1);
-    vst1q_f32(out + index, vout);
-  }
-#endif
-  for (; index < size; index++) {
-    out[index] = in0[index] + in1[index];
-  }
-  return NNACL_OK;
-}
-
-int ElementAddRelu(const float *in0, const float *in1, float *out, int size) {
-  int index = 0;
-#ifdef ENABLE_NEON
-  float32x4_t zeros = vdupq_n_f32(0.0f);
-  for (; index <= size - 4; index += C4NUM) {
-    float32x4_t vin0 = vld1q_f32(in0 + index);
-    float32x4_t vin1 = vld1q_f32(in1 + index);
-    float32x4_t vout = vaddq_f32(vin0, vin1);
-    vout = vbslq_f32(vcgtq_f32(vout, zeros), vout, zeros);
-    vst1q_f32(out + index, vout);
-  }
-#endif
-  for (; index < size; index++) {
-    float res = in0[index] + in1[index];
-    out[index] = res > 0 ? res : 0;
-  }
-  return NNACL_OK;
-}
-
-int ElementAddRelu6(const float *in0, const float *in1, float *out, int size) {
-  int index = 0;
-#ifdef ENABLE_NEON
-  float32x4_t zeros = vdupq_n_f32(0.0f);
-  float32x4_t bounds = vdupq_n_f32(6.0f);
-  for (; index <= size - 4; index += C4NUM) {
-    float32x4_t vin0 = vld1q_f32(in0 + index);
-    float32x4_t vin1 = vld1q_f32(in1 + index);
-    float32x4_t vout = vminq_f32(vmaxq_f32(vaddq_f32(vin0, vin1), zeros), bounds);
-    vst1q_f32(out + index, vout);
-  }
-#endif
-  for (; index < size; index++) {
-    out[index] = MSMIN(MSMAX(in0[index] + in1[index], 0), 6);
-  }
-  return NNACL_OK;
-}
-
-int ElementAddInt(const int *in0, const int *in1, int *out, int size) {
-  int index = 0;
-#ifdef ENABLE_NEON
-  for (; index <= size - 4; index += C4NUM) {
-    int32x4_t vin0 = vld1q_s32(in0 + index);
-    int32x4_t vin1 = vld1q_s32(in1 + index);
-    int32x4_t vout = vaddq_s32(vin0, vin1);
-    vst1q_s32(out + index, vout);
-  }
-#endif
-  for (; index < size; index++) {
-    out[index] = in0[index] + in1[index];
-  }
-  return NNACL_OK;
-}
-
-int ElementSub(const float *in0, const float *in1, float *out, int size) {
-  int index = 0;
-#ifdef ENABLE_NEON
-  for (; index <= size - 4; index += C4NUM) {
-    float32x4_t vin0 = vld1q_f32(in0 + index);
-    float32x4_t vin1 = vld1q_f32(in1 + index);
-    float32x4_t vout = vsubq_f32(vin0, vin1);
-    vst1q_f32(out + index, vout);
-  }
-#endif
-  for (; index < size; index++) {
-    out[index] = in0[index] - in1[index];
-  }
-  return NNACL_OK;
-}
-
-int ElementSubInt(const int *in0, const int *in1, int *out, int size) {
-  int index = 0;
-#ifdef ENABLE_NEON
-  for (; index <= size - 4; index += C4NUM) {
-    int32x4_t vin0 = vld1q_s32(in0 + index);
-    int32x4_t vin1 = vld1q_s32(in1 + index);
-    int32x4_t vout = vsubq_s32(vin0, vin1);
-    vst1q_s32(out + index, vout);
-  }
-#endif
-  for (; index < size; index++) {
-    out[index] = in0[index] - in1[index];
-  }
-  return NNACL_OK;
-}
-
-int ElementSubRelu(const float *in0, const float *in1, float *out, int size) {
-  int index = 0;
-#ifdef ENABLE_NEON
-  float32x4_t zeros = vdupq_n_f32(0.0f);
-  for (; index <= size - 4; index += C4NUM) {
-    float32x4_t vin0 = vld1q_f32(in0 + index);
-    float32x4_t vin1 = vld1q_f32(in1 + index);
-    float32x4_t vout = vsubq_f32(vin0, vin1);
-    vout = vbslq_f32(vcgtq_f32(vout, zeros), vout, zeros);
-    vst1q_f32(out + index, vout);
-  }
-#endif
-  for (; index < size; index++) {
-    float res = in0[index] - in1[index];
-    out[index] = res > 0 ? res : 0;
-  }
-  return NNACL_OK;
-}
-
-int ElementSubRelu6(const float *in0, const float *in1, float *out, int size) {
-  int index = 0;
-#ifdef ENABLE_NEON
-  float32x4_t zeros = vdupq_n_f32(0.0f);
-  float32x4_t bounds = vdupq_n_f32(6.0f);
-  for (; index <= size - 4; index += C4NUM) {
-    float32x4_t vin0 = vld1q_f32(in0 + index);
-    float32x4_t vin1 = vld1q_f32(in1 + index);
-    float32x4_t vout = vminq_f32(vmaxq_f32(vsubq_f32(vin0, vin1), zeros), bounds);
-    vst1q_f32(out + index, vout);
-  }
-#endif
-  for (; index < size; index++) {
-    out[index] = MSMIN(MSMAX(in0[index] - in1[index], 0), 6);
-  }
-
-  return NNACL_OK;
-}
-
-int BroadcastDiv(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size,
-                 ArithmeticParameter *param) {
-  TileDimensionsFp32(in0, in1, tile_in0, tile_in1, param);
-  return ElementDiv(tile_in0, tile_in1, out, size);
-}
-
-int ElementDiv(const float *in0, const float *in1, float *out, int size) {
-  for (int i = 0; i < size; i++) {
-    out[i] = in0[i] / in1[i];
-  }
-  return NNACL_OK;
-}
-
-int ElementDivRelu(const float *in0, const float *in1, float *out, int size) {
-  for (int i = 0; i < size; i++) {
-    float res = in0[i] / in1[i];
-    out[i] = res > 0 ? res : 0;
-  }
-  return NNACL_OK;
-}
-
-int ElementDivRelu6(const float *in0, const float *in1, float *out, int size) {
-  for (int i = 0; i < size; i++) {
-    out[i] = MSMIN(MSMAX(in0[i] / in1[i], 0), 6);
-  }
-  return NNACL_OK;
-}
-
 int ElementFloorMod(const float *in0, const float *in1, float *out, int size) {
   for (int i = 0; i < size; i++) {
     out[i] = in0[i] - floorf(in0[i] / in1[i]) * in1[i];
@@ -929,11 +123,6 @@ int ElementLogicalAndBool(const bool *in0, const bool *in1, bool *out, int size)
   return NNACL_OK;
 }
 
-int ElementSquaredDifference(const float *in0, const float *in1, float *out, int size) {
-  ElementSub(in0, in1, out, size);
-  return ElementMul(out, out, out, size);
-}
-
 int ElementLogicalOr(const float *in0, const float *in1, float *out, int size) {
   int index = 0;
 #ifdef ENABLE_NEON
diff --git a/mindspore/lite/nnacl/fp32/arithmetic_fp32.h b/mindspore/lite/nnacl/fp32/arithmetic_fp32.h
index f076e40459..1580ea3b9f 100644
--- a/mindspore/lite/nnacl/fp32/arithmetic_fp32.h
+++ b/mindspore/lite/nnacl/fp32/arithmetic_fp32.h
@@ -22,6 +22,11 @@
 #include "nnacl/op_base.h"
 #include "nnacl/base/arithmetic_base.h"
 #include "nnacl/errorcode.h"
+#include "nnacl/fp32/add_fp32.h"
+#include "nnacl/fp32/mul_fp32.h"
+#include "nnacl/fp32/div_fp32.h"
+#include "nnacl/fp32/sub_fp32.h"
+#include "nnacl/fp32/squared_difference.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -30,56 +35,6 @@ void TileOneDimensionFp32(const float *inData, float *outData, int dim, size_t n
                           const int *inStrides, const int *outStrides, const int *multiple);
 void TileDimensionsFp32(const float *data0, const float *data1, float *tile_data0, float *tile_data1,
                         ArithmeticParameter *param);
-
-/* Mul */
-int ElementMul(const float *in0, const float *in1, float *out, int size);
-int ElementMulRelu(const float *in0, const float *in1, float *out, int size);
-int ElementMulRelu6(const float *in0, const float *in1, float *out, int size);
-int ElementMulInt(const int *in0, const int *in1, int *out, int size);
-int ElementMulReluInt(const int *in0, const int *in1, int *out, int size);
-int ElementMulRelu6Int(const int *in0, const int *in1, int *out, int size);
-int ElementOptMul(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
-int ElementOptMulRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
-int ElementOptMulRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
-int ElementOptMulInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param);
-int ElementOptMulReluInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param);
-int ElementOptMulRelu6Int(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param);
-int BroadcastMul(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size,
-                 ArithmeticParameter *param);
-
-/* Add */
-int ElementAdd(const float *in0, const float *in1, float *out, int size);
-int ElementAddRelu(const float *in0, const float *in1, float *out, int size);
-int ElementAddRelu6(const float *in0, const float *in1, float *out, int size);
-int ElementAddInt(const int *in0, const int *in1, int *out, int size);
-int ElementOptAdd(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
-int ElementOptAddInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param);
-int ElementOptAddRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
-int ElementOptAddRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
-int BroadcastAdd(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size,
-                 ArithmeticParameter *param);
-
-/* Sub */
-int ElementSub(const float *in0, const float *in1, float *out, int size);
-int ElementSubInt(const int *in0, const int *in1, int *out, int size);
-int ElementSubRelu(const float *in0, const float *in1, float *out, int size);
-int ElementSubRelu6(const float *in0, const float *in1, float *out, int size);
-int ElementOptSub(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
-int ElementOptSubRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
-int ElementOptSubRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
-int ElementOptSubInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param);
-
-/* Div */
-int ElementDiv(const float *in0, const float *in1, float *out, int size);
-int ElementDivRelu(const float *in0, const float *in1, float *out, int size);
-int ElementDivRelu6(const float *in0, const float *in1, float *out, int size);
-int ElementOptDiv(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
-int ElementOptDivRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
-int ElementOptDivRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
-int ElementOptDivInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param);
-int BroadcastDiv(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size,
-                 ArithmeticParameter *param);
-
 /* logical and */
 int ElementLogicalAnd(const float *in0, const float *in1, float *out, int size);
 int ElementLogicalAndInt(const int *in0, const int *in1, int *out, int size);
@@ -88,9 +43,6 @@ int ElementLogicalAndBool(const bool *in0, const bool *in1, bool *out, int size)
 /* logical or */
 int ElementLogicalOr(const float *in0, const float *in1, float *out, int size);
 
-/* Element Squared Difference */
-int ElementSquaredDifference(const float *in0, const float *in1, float *out, int size);
-
 /* max min */
 int ElementMaximum(const float *in0, const float *in1, float *out, int size);
 int ElementMinimum(const float *in0, const float *in1, float *out, int size);
diff --git a/mindspore/lite/nnacl/fp32/div_fp32.c b/mindspore/lite/nnacl/fp32/div_fp32.c
new file mode 100644
index 0000000000..9d1f665ea1
--- /dev/null
+++ b/mindspore/lite/nnacl/fp32/div_fp32.c
@@ -0,0 +1,107 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "nnacl/fp32/div_fp32.h"
+#include <math.h>
+#include "nnacl/fp32/arithmetic_fp32.h"
+
+int ElementOptDiv(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
+  if (param->in_elements_num0_ == 1) {
+    for (int index = 0; index < size; index++) {
+      out[index] = in0[0] / in1[index];
+    }
+  } else {
+    if (in1[0] == 0) {
+      return NNACL_ERRCODE_DIVISOR_ZERO;
+    }
+    for (int index = 0; index < size; index++) {
+      out[index] = in0[index] / in1[0];
+    }
+  }
+  return NNACL_OK;
+}
+
+int ElementOptDivRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
+  if (param->in_elements_num0_ == 1) {
+    for (int index = 0; index < size; index++) {
+      out[index] = in0[0] / in1[index];
+      out[index] = out[index] > 0 ? out[index] : 0;
+    }
+  } else {
+    for (int index = 0; index < size; index++) {
+      out[index] = in0[index] / in1[0];
+      out[index] = out[index] > 0 ? out[index] : 0;
+    }
+  }
+  return NNACL_OK;
+}
+
+int ElementOptDivRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
+  if (param->in_elements_num0_ == 1) {
+    for (int index = 0; index < size; index++) {
+      out[index] = MSMIN(MSMAX(in0[0] / in1[index], 0), 6);
+    }
+  } else {
+    for (int index = 0; index < size; index++) {
+      out[index] = MSMIN(MSMAX(in0[index] / in1[0], 0), 6);
+    }
+  }
+  return NNACL_OK;
+}
+
+int ElementOptDivInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) {
+  if (param->in_elements_num0_ == 1) {
+    for (int index = 0; index < size; index++) {
+      out[index] = in0[0] / in1[index];
+    }
+  } else {
+    if (in1[0] == 0) {
+      return NNACL_ERRCODE_DIVISOR_ZERO;
+    }
+    for (int index = 0; index < size; index++) {
+      out[index] = in0[index] / in1[0];
+    }
+  }
+  return NNACL_OK;
+}
+
+int BroadcastDiv(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size,
+                 ArithmeticParameter *param) {
+  TileDimensionsFp32(in0, in1, tile_in0, tile_in1, param);
+  return ElementDiv(tile_in0, tile_in1, out, size);
+}
+
+int ElementDiv(const float *in0, const float *in1, float *out, int size) {
+  for (int i = 0; i < size; i++) {
+    out[i] = in0[i] / in1[i];
+  }
+  return NNACL_OK;
+}
+
+int ElementDivRelu(const float *in0, const float *in1, float *out, int size) {
+  for (int i = 0; i < size; i++) {
+    float res = in0[i] / in1[i];
+    out[i] = res > 0 ? res : 0;
+  }
+  return NNACL_OK;
+}
+
+int ElementDivRelu6(const float *in0, const float *in1, float *out, int size) {
+  for (int i = 0; i < size; i++) {
+    out[i] = MSMIN(MSMAX(in0[i] / in1[i], 0), 6);
+  }
+  return NNACL_OK;
+}
diff --git a/mindspore/lite/nnacl/fp32/div_fp32.h b/mindspore/lite/nnacl/fp32/div_fp32.h
new file mode 100644
index 0000000000..755d678d55
--- /dev/null
+++ b/mindspore/lite/nnacl/fp32/div_fp32.h
@@ -0,0 +1,43 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_LITE_NNACL_FP32_DIV_H_
+#define MINDSPORE_LITE_NNACL_FP32_DIV_H_
+
+#ifdef ENABLE_NEON
+#include <arm_neon.h>
+#endif
+#include "nnacl/op_base.h"
+#include "nnacl/base/arithmetic_base.h"
+#include "nnacl/errorcode.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+int ElementDiv(const float *in0, const float *in1, float *out, int size);
+int ElementDivRelu(const float *in0, const float *in1, float *out, int size);
+int ElementDivRelu6(const float *in0, const float *in1, float *out, int size);
+int ElementOptDiv(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
+int ElementOptDivRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
+int ElementOptDivRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
+int ElementOptDivInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param);
+int BroadcastDiv(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size,
+                 ArithmeticParameter *param);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // MINDSPORE_LITE_NNACL_FP32_DIV_H_
diff --git a/mindspore/lite/nnacl/fp32/lstm_fp32.c b/mindspore/lite/nnacl/fp32/lstm_fp32.c
index fd615201bd..7c88d8707e 100644
--- a/mindspore/lite/nnacl/fp32/lstm_fp32.c
+++ b/mindspore/lite/nnacl/fp32/lstm_fp32.c
@@ -19,6 +19,7 @@
 #include <float.h>
 #include "nnacl/fp32/activation_fp32.h"
 #include "nnacl/fp32/arithmetic_fp32.h"
+#include "nnacl/fp32/mul_fp32.h"
 
 void InitGate(float *gate_buffer, const float *bias, const LstmParameter *lstm_parm) {
   int gate_offest = 0;
diff --git a/mindspore/lite/nnacl/fp32/mul_fp32.c b/mindspore/lite/nnacl/fp32/mul_fp32.c
new file mode 100644
index 0000000000..4fea298205
--- /dev/null
+++ b/mindspore/lite/nnacl/fp32/mul_fp32.c
@@ -0,0 +1,327 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "nnacl/fp32/mul_fp32.h"
+#include "nnacl/fp32/arithmetic_fp32.h"
+
+int BroadcastMul(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size,
+                 ArithmeticParameter *param) {
+  TileDimensionsFp32(in0, in1, tile_in0, tile_in1, param);
+  return ElementMul(tile_in0, tile_in1, out, size);
+}
+
+int ElementMul(const float *in0, const float *in1, float *out, int size) {
+  int index = 0;
+#ifdef ENABLE_NEON
+  for (; index <= size - 4; index += C4NUM) {
+    float32x4_t vin0 = vld1q_f32(in0 + index);
+    float32x4_t vin1 = vld1q_f32(in1 + index);
+    float32x4_t vout = vmulq_f32(vin0, vin1);
+    vst1q_f32(out + index, vout);
+  }
+#endif
+  for (; index < size; index++) {
+    out[index] = in0[index] * in1[index];
+  }
+  return NNACL_OK;
+}
+
+int ElementMulRelu(const float *in0, const float *in1, float *out, int size) {
+  int index = 0;
+#ifdef ENABLE_NEON
+  float32x4_t zeros = vdupq_n_f32(0.0f);
+  for (; index <= size - 4; index += C4NUM) {
+    float32x4_t vin0 = vld1q_f32(in0 + index);
+    float32x4_t vin1 = vld1q_f32(in1 + index);
+    float32x4_t vout = vmulq_f32(vin0, vin1);
+    vout = vbslq_f32(vcgtq_f32(vout, zeros), vout, zeros);
+    vst1q_f32(out + index, vout);
+  }
+#endif
+  for (; index < size; index++) {
+    float res = in0[index] * in1[index];
+    out[index] = res > 0 ? res : 0;
+  }
+  return NNACL_OK;
+}
+
+int ElementMulRelu6(const float *in0, const float *in1, float *out, int size) {
+  int index = 0;
+#ifdef ENABLE_NEON
+  float32x4_t zeros = vdupq_n_f32(0.0f);
+  float32x4_t bounds = vdupq_n_f32(6.0f);
+  for (; index <= size - 4; index += C4NUM) {
+    float32x4_t vin0 = vld1q_f32(in0 + index);
+    float32x4_t vin1 = vld1q_f32(in1 + index);
+    float32x4_t vout = vminq_f32(vmaxq_f32(vmulq_f32(vin0, vin1), zeros), bounds);
+    vst1q_f32(out + index, vout);
+  }
+#endif
+  for (; index < size; index++) {
+    out[index] = MSMIN(MSMAX(in0[index] * in1[index], 0), 6);
+  }
+  return NNACL_OK;
+}
+
+int ElementMulInt(const int *in0, const int *in1, int *out, int size) {
+  int index = 0;
+#ifdef ENABLE_NEON
+  for (; index <= size - 4; index += C4NUM) {
+    int32x4_t vin0 = vld1q_s32(in0 + index);
+    int32x4_t vin1 = vld1q_s32(in1 + index);
+    int32x4_t vout = vmulq_s32(vin0, vin1);
+    vst1q_s32(out + index, vout);
+  }
+#endif
+  for (; index < size; index++) {
+    out[index] = in0[index] * in1[index];
+  }
+  return NNACL_OK;
+}
+
+int ElementMulReluInt(const int *in0, const int *in1, int *out, int size) {
+  int index = 0;
+#ifdef ENABLE_NEON
+  int32x4_t zeros = vdupq_n_s32(0);
+  for (; index <= size - 4; index += C4NUM) {
+    int32x4_t vin0 = vld1q_s32(in0 + index);
+    int32x4_t vin1 = vld1q_s32(in1 + index);
+    int32x4_t vout = vmulq_s32(vin0, vin1);
+    vout = vbslq_s32(vcgtq_s32(vout, zeros), vout, zeros);
+    vst1q_s32(out + index, vout);
+  }
+#endif
+  for (; index < size; index++) {
+    float res = in0[index] * in1[index];
+    out[index] = res > 0 ? res : 0;
+  }
+  return NNACL_OK;
+}
+
+int ElementMulRelu6Int(const int *in0, const int *in1, int *out, int size) {
+  int index = 0;
+#ifdef ENABLE_NEON
+  int32x4_t zeros = vdupq_n_s32(0);
+  int32x4_t bounds = vdupq_n_s32(6);
+  for (; index <= size - 4; index += C4NUM) {
+    int32x4_t vin0 = vld1q_s32(in0 + index);
+    int32x4_t vin1 = vld1q_s32(in1 + index);
+    int32x4_t vout = vminq_s32(vmaxq_s32(vmulq_s32(vin0, vin1), zeros), bounds);
+    vst1q_s32(out + index, vout);
+  }
+#endif
+  for (; index < size; index++) {
+    out[index] = MSMIN(MSMAX(in0[index] * in1[index], 0), 6);
+  }
+  return NNACL_OK;
+}
+
+int ElementOptMul(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
+#ifdef ENABLE_NEON
+  float32x4_t vin0_opt = vdupq_n_f32(in0[0]);
+  float32x4_t vin1_opt = vdupq_n_f32(in1[0]);
+#endif
+  int index = 0;
+  if (param->in_elements_num0_ == 1) {
+#ifdef ENABLE_NEON
+    for (; index <= size - 4; index += C4NUM) {
+      float32x4_t vin1 = vld1q_f32(in1 + index);
+      float32x4_t vout = vmulq_f32(vin0_opt, vin1);
+      vst1q_f32(out + index, vout);
+    }
+#endif
+    for (; index < size; index++) {
+      out[index] = in0[0] * in1[index];
+    }
+  } else {
+#ifdef ENABLE_NEON
+    for (; index <= size - 4; index += C4NUM) {
+      float32x4_t vin0 = vld1q_f32(in0 + index);
+      float32x4_t vout = vmulq_f32(vin0, vin1_opt);
+      vst1q_f32(out + index, vout);
+    }
+#endif
+    for (; index < size; index++) {
+      out[index] = in0[index] * in1[0];
+    }
+  }
+  return NNACL_OK;
+}
+
+int ElementOptMulRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
+#ifdef ENABLE_NEON
+  float32x4_t vin0_opt = vdupq_n_f32(in0[0]);
+  float32x4_t vin1_opt = vdupq_n_f32(in1[0]);
+  float32x4_t zeros = vdupq_n_f32(0.0f);
+#endif
+  int index = 0;
+  if (param->in_elements_num0_ == 1) {
+#ifdef ENABLE_NEON
+    for (; index <= size - 4; index += C4NUM) {
+      float32x4_t vin1 = vld1q_f32(in1 + index);
+      float32x4_t vout = vmaxq_f32(vmulq_f32(vin0_opt, vin1), zeros);
+      vst1q_f32(out + index, vout);
+    }
+#endif
+    for (; index < size; index++) {
+      out[index] = MSMAX(in0[0] * in1[index], 0);
+    }
+  } else {
+#ifdef ENABLE_NEON
+    for (; index <= size - 4; index += C4NUM) {
+      float32x4_t vin0 = vld1q_f32(in0 + index);
+      float32x4_t vout = vmaxq_f32(vmulq_f32(vin0, vin1_opt), zeros);
+      vst1q_f32(out + index, vout);
+    }
+#endif
+    for (; index < size; index++) {
+      out[index] = MSMAX(in0[index] * in1[0], 0);
+    }
+  }
+  return NNACL_OK;
+}
+
+int ElementOptMulRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
+#ifdef ENABLE_NEON
+  float32x4_t vin0_opt = vdupq_n_f32(in0[0]);
+  float32x4_t vin1_opt = vdupq_n_f32(in1[0]);
+  float32x4_t zeros = vdupq_n_f32(0.0f);
+  float32x4_t bounds = vdupq_n_f32(6.0f);
+#endif
+  int index = 0;
+  if (param->in_elements_num0_ == 1) {
+#ifdef ENABLE_NEON
+    for (; index <= size - 4; index += C4NUM) {
+      float32x4_t vin1 = vld1q_f32(in1 + index);
+      float32x4_t vout = vminq_f32(vmaxq_f32(vmulq_f32(vin0_opt, vin1), zeros), bounds);
+      vst1q_f32(out + index, vout);
+    }
+#endif
+    for (; index < size; index++) {
+      out[index] = MSMIN(MSMAX(in0[0] * in1[index], 0), 6);
+    }
+  } else {
+#ifdef ENABLE_NEON
+    for (; index <= size - 4; index += C4NUM) {
+      float32x4_t vin0 = vld1q_f32(in0 + index);
+      float32x4_t vout = vminq_f32(vmaxq_f32(vmulq_f32(vin0, vin1_opt), zeros), bounds);
+      vst1q_f32(out + index, vout);
+    }
+#endif
+    for (; index < size; index++) {
+      out[index] = MSMIN(MSMAX(in0[index] * in1[0], 0), 6);
+    }
+  }
+  return NNACL_OK;
+}
+
+int ElementOptMulInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) {
+#ifdef ENABLE_NEON
+  int32x4_t vin0_opt = vdupq_n_s32(in0[0]);
+  int32x4_t vin1_opt = vdupq_n_s32(in1[0]);
+#endif
+  int index = 0;
+  if (param->in_elements_num0_ == 1) {
+#ifdef ENABLE_NEON
+    for (; index <= size - 4; index += C4NUM) {
+      int32x4_t vin1 = vld1q_s32(in1 + index);
+      int32x4_t vout = vmulq_s32(vin0_opt, vin1);
+      vst1q_s32(out + index, vout);
+    }
+#endif
+    for (; index < size; index++) {
+      out[index] = in0[0] * in1[index];
+    }
+  } else {
+#ifdef ENABLE_NEON
+    for (; index <= size - 4; index += C4NUM) {
+      int32x4_t vin0 = vld1q_s32(in0 + index);
+      int32x4_t vout = vmulq_s32(vin0, vin1_opt);
+      vst1q_s32(out + index, vout);
+    }
+#endif
+    for (; index < size; index++) {
+      out[index] = in0[index] * in1[0];
+    }
+  }
+  return NNACL_OK;
+}
+
+int ElementOptMulReluInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) {
+#ifdef ENABLE_NEON
+  int32x4_t vin0_opt = vdupq_n_s32(in0[0]);
+  int32x4_t vin1_opt = vdupq_n_s32(in1[0]);
+  int32x4_t zeros = vdupq_n_s32(0);
+#endif
+  int index = 0;
+  if (param->in_elements_num0_ == 1) {
+#ifdef ENABLE_NEON
+    for (; index <= size - 4; index += C4NUM) {
+      int32x4_t vin1 = vld1q_s32(in1 + index);
+      int32x4_t vout = vmaxq_s32(vmulq_s32(vin0_opt, vin1), zeros);
+      vst1q_s32(out + index, vout);
+    }
+#endif
+    for (; index < size; index++) {
+      out[index] = MSMAX(in0[0] * in1[index], 0);
+    }
+  } else {
+#ifdef ENABLE_NEON
+    for (; index <= size - 4; index += C4NUM) {
+      int32x4_t vin0 = vld1q_s32(in0 + index);
+      int32x4_t vout = vmaxq_s32(vmulq_s32(vin0, vin1_opt), zeros);
+      vst1q_s32(out + index, vout);
+    }
+#endif
+    for (; index < size; index++) {
+      out[index] = MSMAX(in0[index] * in1[0], 0);
+    }
+  }
+  return NNACL_OK;
+}
+
+int ElementOptMulRelu6Int(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) {
+#ifdef ENABLE_NEON
+  int32x4_t vin0_opt = vdupq_n_s32(in0[0]);
+  int32x4_t vin1_opt = vdupq_n_s32(in1[0]);
+  int32x4_t zeros = vdupq_n_s32(0);
+  int32x4_t bounds = vdupq_n_s32(6);
+#endif
+  int index = 0;
+  if (param->in_elements_num0_ == 1) {
+#ifdef ENABLE_NEON
+    for (; index <= size - 4; index += C4NUM) {
+      int32x4_t vin1 = vld1q_s32(in1 + index);
+      int32x4_t vout = vminq_s32(vmaxq_s32(vmulq_s32(vin0_opt, vin1), zeros), bounds);
+      vst1q_s32(out + index, vout);
+    }
+#endif
+    for (; index < size; index++) {
+      out[index] = MSMIN(MSMAX(in0[0] * in1[index], 0), 6);
+    }
+  } else {
+#ifdef ENABLE_NEON
+    for (; index <= size - 4; index += C4NUM) {
+      int32x4_t vin0 = vld1q_s32(in0 + index);
+      int32x4_t vout = vminq_s32(vmaxq_s32(vmulq_s32(vin0, vin1_opt), zeros), bounds);
+      vst1q_s32(out + index, vout);
+    }
+#endif
+    for (; index < size; index++) {
+      out[index] = MSMIN(MSMAX(in0[index] * in1[0], 0), 6);
+    }
+  }
+  return NNACL_OK;
+}
diff --git a/mindspore/lite/nnacl/fp32/mul_fp32.h b/mindspore/lite/nnacl/fp32/mul_fp32.h
new file mode 100644
index 0000000000..0ff54a43c8
--- /dev/null
+++ b/mindspore/lite/nnacl/fp32/mul_fp32.h
@@ -0,0 +1,49 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_LITE_NNACL_FP32_MUL_H_
+#define MINDSPORE_LITE_NNACL_FP32_MUL_H_
+
+#ifdef ENABLE_NEON
+#include <arm_neon.h>
+#endif
+#include "nnacl/op_base.h"
+#include "nnacl/base/arithmetic_base.h"
+#include "nnacl/errorcode.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int ElementMul(const float *in0, const float *in1, float *out, int size);
+int ElementMulRelu(const float *in0, const float *in1, float *out, int size);
+int ElementMulRelu6(const float *in0, const float *in1, float *out, int size);
+int ElementMulInt(const int *in0, const int *in1, int *out, int size);
+int ElementMulReluInt(const int *in0, const int *in1, int *out, int size);
+int ElementMulRelu6Int(const int *in0, const int *in1, int *out, int size);
+int ElementOptMul(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
+int ElementOptMulRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
+int ElementOptMulRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
+int ElementOptMulInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param);
+int ElementOptMulReluInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param);
+int ElementOptMulRelu6Int(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param);
+int BroadcastMul(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size,
+                 ArithmeticParameter *param);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // MINDSPORE_LITE_NNACL_FP32_MUL_H_
diff --git a/mindspore/lite/nnacl/fp32/squared_difference.c b/mindspore/lite/nnacl/fp32/squared_difference.c
new file mode 100644
index 0000000000..0340329009
--- /dev/null
+++ b/mindspore/lite/nnacl/fp32/squared_difference.c
@@ -0,0 +1,28 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_LITE_NNACL_SQUARED_DIFFERENCE_H_
+#define MINDSPORE_LITE_NNACL_SQUARED_DIFFERENCE_H_
+
+#include "nnacl/fp32/squared_difference.h"
+#include "nnacl/fp32/sub_fp32.h"
+#include "nnacl/fp32/mul_fp32.h"
+
+int ElementSquaredDifference(const float *in0, const float *in1, float *out, int size) {
+  ElementSub(in0, in1, out, size);
+  return ElementMul(out, out, out, size);
+}
+
+#endif  // MINDSPORE_LITE_NNACL_SQUARED_DIFFERENCE_H_
diff --git a/mindspore/lite/nnacl/fp32/squared_difference.h b/mindspore/lite/nnacl/fp32/squared_difference.h
new file mode 100644
index 0000000000..71098b5525
--- /dev/null
+++ b/mindspore/lite/nnacl/fp32/squared_difference.h
@@ -0,0 +1,37 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_LITE_NNACL_SQUARED_DIFFERENCE_H_
+#define MINDSPORE_LITE_NNACL_SQUARED_DIFFERENCE_H_
+
+#ifdef ENABLE_NEON
+#include <arm_neon.h>
+#endif
+#include "nnacl/op_base.h"
+#include "nnacl/base/arithmetic_base.h"
+#include "nnacl/errorcode.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Element Squared Difference */
+int ElementSquaredDifference(const float *in0, const float *in1, float *out, int size);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // MINDSPORE_LITE_NNACL_SQUARED_DIFFERENCE_H_
diff --git a/mindspore/lite/nnacl/fp32/sub_fp32.c b/mindspore/lite/nnacl/fp32/sub_fp32.c
new file mode 100644
index 0000000000..e2dfcb7fdb
--- /dev/null
+++ b/mindspore/lite/nnacl/fp32/sub_fp32.c
@@ -0,0 +1,217 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "nnacl/fp32/sub_fp32.h"
+
+int ElementOptSub(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
+#ifdef ENABLE_NEON
+  float32x4_t vin0_opt = vdupq_n_f32(in0[0]);
+  float32x4_t vin1_opt = vdupq_n_f32(in1[0]);
+#endif
+  int index = 0;
+  if (param->in_elements_num0_ == 1) {
+#ifdef ENABLE_NEON
+    for (; index <= size - 4; index += C4NUM) {
+      float32x4_t vin1 = vld1q_f32(in1 + index);
+      float32x4_t vout = vsubq_f32(vin0_opt, vin1);
+      vst1q_f32(out + index, vout);
+    }
+#endif
+    for (; index < size; index++) {
+      out[index] = in0[0] - in1[index];
+    }
+  } else {
+#ifdef ENABLE_NEON
+    for (; index <= size - 4; index += C4NUM) {
+      float32x4_t vin0 = vld1q_f32(in0 + index);
+      float32x4_t vout = vsubq_f32(vin0, vin1_opt);
+      vst1q_f32(out + index, vout);
+    }
+#endif
+    for (; index < size; index++) {
+      out[index] = in0[index] - in1[0];
+    }
+  }
+  return NNACL_OK;
+}
+
+int ElementOptSubInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) {
+#ifdef ENABLE_NEON
+  int32x4_t vin0_opt = vdupq_n_s32(in0[0]);
+  int32x4_t vin1_opt = vdupq_n_s32(in1[0]);
+#endif
+  int index = 0;
+  if (param->in_elements_num0_ == 1) {
+#ifdef ENABLE_NEON
+    for (; index <= size - 4; index += C4NUM) {
+      int32x4_t vin1 = vld1q_s32(in1 + index);
+      int32x4_t vout = vsubq_s32(vin0_opt, vin1);
+      vst1q_s32(out + index, vout);
+    }
+#endif
+    for (; index < size; index++) {
+      out[index] = in0[0] - in1[index];
+    }
+  } else {
+#ifdef ENABLE_NEON
+    for (; index <= size - 4; index += C4NUM) {
+      int32x4_t vin0 = vld1q_s32(in0 + index);
+      int32x4_t vout = vsubq_s32(vin0, vin1_opt);
+      vst1q_s32(out + index, vout);
+    }
+#endif
+    for (; index < size; index++) {
+      out[index] = in0[index] - in1[0];
+    }
+  }
+  return NNACL_OK;
+}
+
+int ElementOptSubRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
+#ifdef ENABLE_NEON
+  float32x4_t vin0_opt = vdupq_n_f32(in0[0]);
+  float32x4_t vin1_opt = vdupq_n_f32(in1[0]);
+  float32x4_t zeros = vdupq_n_f32(0.0f);
+#endif
+  int index = 0;
+  if (param->in_elements_num0_ == 1) {
+#ifdef ENABLE_NEON
+    for (; index <= size - 4; index += C4NUM) {
+      float32x4_t vin1 = vld1q_f32(in1 + index);
+      float32x4_t vout = vmaxq_f32(vsubq_f32(vin0_opt, vin1), zeros);
+      vst1q_f32(out + index, vout);
+    }
+#endif
+    for (; index < size; index++) {
+      out[index] = MSMAX(in0[0] - in1[index], 0);
+    }
+  } else {
+#ifdef ENABLE_NEON
+    for (; index <= size - 4; index += C4NUM) {
+      float32x4_t vin0 = vld1q_f32(in0 + index);
+      float32x4_t vout = vmaxq_f32(vsubq_f32(vin0, vin1_opt), zeros);
+      vst1q_f32(out + index, vout);
+    }
+#endif
+    for (; index < size; index++) {
+      out[index] = MSMAX(in0[index] - in1[0], 0);
+    }
+  }
+  return NNACL_OK;
+}
+
+int ElementOptSubRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
+#ifdef ENABLE_NEON
+  float32x4_t vin0_opt = vdupq_n_f32(in0[0]);
+  float32x4_t vin1_opt = vdupq_n_f32(in1[0]);
+  float32x4_t zeros = vdupq_n_f32(0.0f);
+  float32x4_t bounds = vdupq_n_f32(6.0f);
+#endif
+  int index = 0;
+  if (param->in_elements_num0_ == 1) {
+#ifdef ENABLE_NEON
+    for (; index <= size - 4; index += C4NUM) {
+      float32x4_t vin1 = vld1q_f32(in1 + index);
+      float32x4_t vout = vminq_f32(vmaxq_f32(vsubq_f32(vin0_opt, vin1), zeros), bounds);
+      vst1q_f32(out + index, vout);
+    }
+#endif
+    for (; index < size; index++) {
+      out[index] = MSMIN(MSMAX(in0[0] - in1[index], 0), 6);
+    }
+  } else {
+#ifdef ENABLE_NEON
+    for (; index <= size - 4; index += C4NUM) {
+      float32x4_t vin0 = vld1q_f32(in0 + index);
+      float32x4_t vout = vminq_f32(vmaxq_f32(vsubq_f32(vin0, vin1_opt), zeros), bounds);
+      vst1q_f32(out + index, vout);
+    }
+#endif
+    for (; index < size; index++) {
+      out[index] = MSMIN(MSMAX(in0[index] - in1[0], 0), 6);
+    }
+  }
+  return NNACL_OK;
+}
+
+int ElementSub(const float *in0, const float *in1, float *out, int size) {
+  int index = 0;
+#ifdef ENABLE_NEON
+  for (; index <= size - 4; index += C4NUM) {
+    float32x4_t vin0 = vld1q_f32(in0 + index);
+    float32x4_t vin1 = vld1q_f32(in1 + index);
+    float32x4_t vout = vsubq_f32(vin0, vin1);
+    vst1q_f32(out + index, vout);
+  }
+#endif
+  for (; index < size; index++) {
+    out[index] = in0[index] - in1[index];
+  }
+  return NNACL_OK;
+}
+
+int ElementSubInt(const int *in0, const int *in1, int *out, int size) {
+  int index = 0;
+#ifdef ENABLE_NEON
+  for (; index <= size - 4; index += C4NUM) {
+    int32x4_t vin0 = vld1q_s32(in0 + index);
+    int32x4_t vin1 = vld1q_s32(in1 + index);
+    int32x4_t vout = vsubq_s32(vin0, vin1);
+    vst1q_s32(out + index, vout);
+  }
+#endif
+  for (; index < size; index++) {
+    out[index] = in0[index] - in1[index];
+  }
+  return NNACL_OK;
+}
+
+int ElementSubRelu(const float *in0, const float *in1, float *out, int size) {
+  int index = 0;
+#ifdef ENABLE_NEON
+  float32x4_t zeros = vdupq_n_f32(0.0f);
+  for (; index <= size - 4; index += C4NUM) {
+    float32x4_t vin0 = vld1q_f32(in0 + index);
+    float32x4_t vin1 = vld1q_f32(in1 + index);
+    float32x4_t vout = vsubq_f32(vin0, vin1);
+    vout = vbslq_f32(vcgtq_f32(vout, zeros), vout, zeros);
+    vst1q_f32(out + index, vout);
+  }
+#endif
+  for (; index < size; index++) {
+    float res = in0[index] - in1[index];
+    out[index] = res > 0 ? res : 0;
+  }
+  return NNACL_OK;
+}
+
+int ElementSubRelu6(const float *in0, const float *in1, float *out, int size) {
+  int index = 0;
+#ifdef ENABLE_NEON
+  float32x4_t zeros = vdupq_n_f32(0.0f);
+  float32x4_t bounds = vdupq_n_f32(6.0f);
+  for (; index <= size - 4; index += C4NUM) {
+    float32x4_t vin0 = vld1q_f32(in0 + index);
+    float32x4_t vin1 = vld1q_f32(in1 + index);
+    float32x4_t vout = vminq_f32(vmaxq_f32(vsubq_f32(vin0, vin1), zeros), bounds);
+    vst1q_f32(out + index, vout);
+  }
+#endif
+  for (; index < size; index++) {
+    out[index] = MSMIN(MSMAX(in0[index] - in1[index], 0), 6);
+  }
+
+  return NNACL_OK;
+}
diff --git a/mindspore/lite/nnacl/fp32/sub_fp32.h b/mindspore/lite/nnacl/fp32/sub_fp32.h
new file mode 100644
index 0000000000..b846417190
--- /dev/null
+++ b/mindspore/lite/nnacl/fp32/sub_fp32.h
@@ -0,0 +1,43 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_LITE_NNACL_SUB_FP32_H_
+#define MINDSPORE_LITE_NNACL_SUB_FP32_H_
+
+#ifdef ENABLE_NEON
+#include <arm_neon.h>
+#endif
+#include "nnacl/op_base.h"
+#include "nnacl/base/arithmetic_base.h"
+#include "nnacl/errorcode.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int ElementSub(const float *in0, const float *in1, float *out, int size);
+int ElementSubInt(const int *in0, const int *in1, int *out, int size);
+int ElementSubRelu(const float *in0, const float *in1, float *out, int size);
+int ElementSubRelu6(const float *in0, const float *in1, float *out, int size);
+int ElementOptSub(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
+int ElementOptSubRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
+int ElementOptSubRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
+int ElementOptSubInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // MINDSPORE_LITE_NNACL_SUB_FP32_H_