move mul/add/div/sub from arithmetic_fp32

5 years ago · fe90be3bb4
--- a/mindspore/lite/nnacl/fp32/add_fp32.c
+++ b/mindspore/lite/nnacl/fp32/add_fp32.c
@@ -0,0 +1,225 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "nnacl/fp32/add_fp32.h"
 #include "nnacl/fp32/arithmetic_fp32.h"

 /**
 * Float addition where one operand is a single broadcast value.
 * When param->in_elements_num0_ == 1, in0[0] is added to every in1[i];
 * otherwise in1[0] is added to every in0[i].
 * Writes size results to out and returns NNACL_OK.
 */
 int ElementOptAdd(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
 #ifdef ENABLE_NEON
  // Both candidate broadcast values are splatted up front; only one is used below.
  const float32x4_t lhs_splat = vdupq_n_f32(in0[0]);
  const float32x4_t rhs_splat = vdupq_n_f32(in1[0]);
 #endif
  int i = 0;
  if (param->in_elements_num0_ == 1) {
 #ifdef ENABLE_NEON
    while (i <= size - 4) {
      const float32x4_t rhs = vld1q_f32(in1 + i);
      vst1q_f32(out + i, vaddq_f32(lhs_splat, rhs));
      i += C4NUM;
    }
 #endif
    // Scalar loop: handles the tail, or everything when NEON is disabled.
    while (i < size) {
      out[i] = in0[0] + in1[i];
      ++i;
    }
    return NNACL_OK;
  }
 #ifdef ENABLE_NEON
  while (i <= size - 4) {
    const float32x4_t lhs = vld1q_f32(in0 + i);
    vst1q_f32(out + i, vaddq_f32(lhs, rhs_splat));
    i += C4NUM;
  }
 #endif
  while (i < size) {
    out[i] = in0[i] + in1[0];
    ++i;
  }
  return NNACL_OK;
 }

 /**
 * Integer addition where one operand is a single broadcast value.
 * When param->in_elements_num0_ == 1, in0[0] is added to every in1[i];
 * otherwise in1[0] is added to every in0[i].
 * Writes size results to out and returns NNACL_OK.
 */
 int ElementOptAddInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) {
 #ifdef ENABLE_NEON
  // Both candidate broadcast values are splatted up front; only one is used below.
  const int32x4_t lhs_splat = vdupq_n_s32(in0[0]);
  const int32x4_t rhs_splat = vdupq_n_s32(in1[0]);
 #endif
  int i = 0;
  if (param->in_elements_num0_ == 1) {
 #ifdef ENABLE_NEON
    while (i <= size - 4) {
      const int32x4_t rhs = vld1q_s32(in1 + i);
      vst1q_s32(out + i, vaddq_s32(lhs_splat, rhs));
      i += C4NUM;
    }
 #endif
    // Scalar loop: handles the tail, or everything when NEON is disabled.
    while (i < size) {
      out[i] = in0[0] + in1[i];
      ++i;
    }
    return NNACL_OK;
  }
 #ifdef ENABLE_NEON
  while (i <= size - 4) {
    const int32x4_t lhs = vld1q_s32(in0 + i);
    vst1q_s32(out + i, vaddq_s32(lhs, rhs_splat));
    i += C4NUM;
  }
 #endif
  while (i < size) {
    out[i] = in0[i] + in1[0];
    ++i;
  }
  return NNACL_OK;
 }

 /**
 * Float addition with one broadcast operand, followed by a max with zero.
 * When param->in_elements_num0_ == 1, in0[0] is added to every in1[i];
 * otherwise in1[0] is added to every in0[i].
 * Writes size results to out and returns NNACL_OK.
 */
 int ElementOptAddRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
 #ifdef ENABLE_NEON
  const float32x4_t lhs_splat = vdupq_n_f32(in0[0]);
  const float32x4_t rhs_splat = vdupq_n_f32(in1[0]);
  const float32x4_t lower = vdupq_n_f32(0.0f);
 #endif
  int i = 0;
  if (param->in_elements_num0_ == 1) {
 #ifdef ENABLE_NEON
    while (i <= size - 4) {
      const float32x4_t sum = vaddq_f32(lhs_splat, vld1q_f32(in1 + i));
      vst1q_f32(out + i, vmaxq_f32(sum, lower));
      i += C4NUM;
    }
 #endif
    // Scalar loop: handles the tail, or everything when NEON is disabled.
    while (i < size) {
      out[i] = MSMAX(in0[0] + in1[i], 0);
      ++i;
    }
    return NNACL_OK;
  }
 #ifdef ENABLE_NEON
  while (i <= size - 4) {
    const float32x4_t sum = vaddq_f32(vld1q_f32(in0 + i), rhs_splat);
    vst1q_f32(out + i, vmaxq_f32(sum, lower));
    i += C4NUM;
  }
 #endif
  while (i < size) {
    out[i] = MSMAX(in0[i] + in1[0], 0);
    ++i;
  }
  return NNACL_OK;
 }

 /**
 * Float addition with one broadcast operand, followed by a max with 0 and a min with 6.
 * When param->in_elements_num0_ == 1, in0[0] is added to every in1[i];
 * otherwise in1[0] is added to every in0[i].
 * Writes size results to out and returns NNACL_OK.
 */
 int ElementOptAddRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
 #ifdef ENABLE_NEON
  const float32x4_t lhs_splat = vdupq_n_f32(in0[0]);
  const float32x4_t rhs_splat = vdupq_n_f32(in1[0]);
  const float32x4_t lower = vdupq_n_f32(0.0f);
  const float32x4_t upper = vdupq_n_f32(6.0f);
 #endif
  int i = 0;
  if (param->in_elements_num0_ == 1) {
 #ifdef ENABLE_NEON
    while (i <= size - 4) {
      const float32x4_t sum = vaddq_f32(lhs_splat, vld1q_f32(in1 + i));
      vst1q_f32(out + i, vminq_f32(vmaxq_f32(sum, lower), upper));
      i += C4NUM;
    }
 #endif
    // Scalar loop: handles the tail, or everything when NEON is disabled.
    while (i < size) {
      out[i] = MSMIN(MSMAX(in0[0] + in1[i], 0), 6);
      ++i;
    }
    return NNACL_OK;
  }
 #ifdef ENABLE_NEON
  while (i <= size - 4) {
    const float32x4_t sum = vaddq_f32(vld1q_f32(in0 + i), rhs_splat);
    vst1q_f32(out + i, vminq_f32(vmaxq_f32(sum, lower), upper));
    i += C4NUM;
  }
 #endif
  while (i < size) {
    out[i] = MSMIN(MSMAX(in0[i] + in1[0], 0), 6);
    ++i;
  }
  return NNACL_OK;
 }

 /**
 * Passes in0/in1 and the tile buffers to TileDimensionsFp32, then adds the two tile
 * buffers element-wise into out.
 * Returns the status produced by ElementAdd.
 */
 int BroadcastAdd(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size,
                 ArithmeticParameter *param) {
  int status;
  TileDimensionsFp32(in0, in1, tile_in0, tile_in1, param);
  status = ElementAdd(tile_in0, tile_in1, out, size);
  return status;
 }

 /**
 * Element-wise float addition: out[i] = in0[i] + in1[i] for i in [0, size).
 * Always returns NNACL_OK.
 */
 int ElementAdd(const float *in0, const float *in1, float *out, int size) {
  int pos = 0;
 #ifdef ENABLE_NEON
  // Vector loop: runs while at least 4 elements remain.
  while (pos <= size - 4) {
    const float32x4_t a = vld1q_f32(in0 + pos);
    const float32x4_t b = vld1q_f32(in1 + pos);
    vst1q_f32(out + pos, vaddq_f32(a, b));
    pos += C4NUM;
  }
 #endif
  // Scalar loop: handles the tail, or everything when NEON is disabled.
  while (pos < size) {
    out[pos] = in0[pos] + in1[pos];
    ++pos;
  }
  return NNACL_OK;
 }

 /**
 * Element-wise float addition where a sum is kept only if it is greater than zero;
 * any other sum is stored as zero.
 * Always returns NNACL_OK.
 */
 int ElementAddRelu(const float *in0, const float *in1, float *out, int size) {
  int pos = 0;
 #ifdef ENABLE_NEON
  const float32x4_t zero_v = vdupq_n_f32(0.0f);
  while (pos <= size - 4) {
    const float32x4_t a = vld1q_f32(in0 + pos);
    const float32x4_t b = vld1q_f32(in1 + pos);
    const float32x4_t sum = vaddq_f32(a, b);
    // Lane mask: set where the sum is strictly positive.
    const uint32x4_t keep = vcgtq_f32(sum, zero_v);
    vst1q_f32(out + pos, vbslq_f32(keep, sum, zero_v));
    pos += C4NUM;
  }
 #endif
  while (pos < size) {
    const float sum = in0[pos] + in1[pos];
    if (sum > 0) {
      out[pos] = sum;
    } else {
      out[pos] = 0;
    }
    ++pos;
  }
  return NNACL_OK;
 }

 /**
 * Element-wise float addition followed by a max with 0 and a min with 6.
 * Always returns NNACL_OK.
 */
 int ElementAddRelu6(const float *in0, const float *in1, float *out, int size) {
  int pos = 0;
 #ifdef ENABLE_NEON
  const float32x4_t lower = vdupq_n_f32(0.0f);
  const float32x4_t upper = vdupq_n_f32(6.0f);
  while (pos <= size - 4) {
    const float32x4_t a = vld1q_f32(in0 + pos);
    const float32x4_t b = vld1q_f32(in1 + pos);
    const float32x4_t sum = vaddq_f32(a, b);
    vst1q_f32(out + pos, vminq_f32(vmaxq_f32(sum, lower), upper));
    pos += C4NUM;
  }
 #endif
  // Scalar loop: handles the tail, or everything when NEON is disabled.
  while (pos < size) {
    out[pos] = MSMIN(MSMAX(in0[pos] + in1[pos], 0), 6);
    ++pos;
  }
  return NNACL_OK;
 }

 /**
 * Element-wise integer addition: out[i] = in0[i] + in1[i] for i in [0, size).
 * Always returns NNACL_OK.
 */
 int ElementAddInt(const int *in0, const int *in1, int *out, int size) {
  int pos = 0;
 #ifdef ENABLE_NEON
  // Vector loop: runs while at least 4 elements remain.
  while (pos <= size - 4) {
    const int32x4_t a = vld1q_s32(in0 + pos);
    const int32x4_t b = vld1q_s32(in1 + pos);
    vst1q_s32(out + pos, vaddq_s32(a, b));
    pos += C4NUM;
  }
 #endif
  // Scalar loop: handles the tail, or everything when NEON is disabled.
  while (pos < size) {
    out[pos] = in0[pos] + in1[pos];
    ++pos;
  }
  return NNACL_OK;
 }
--- a/mindspore/lite/nnacl/fp32/add_fp32.h
+++ b/mindspore/lite/nnacl/fp32/add_fp32.h
@@ -0,0 +1,45 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifndef MINDSPORE_LITE_NNACL_FP32_ADD_H_
 #define MINDSPORE_LITE_NNACL_FP32_ADD_H_

 #ifdef ENABLE_NEON
 #include <arm_neon.h>
 #endif
 #include "nnacl/op_base.h"
 #include "nnacl/base/arithmetic_base.h"
 #include "nnacl/errorcode.h"

 #ifdef __cplusplus
 extern "C" {
 #endif

 /* Element-wise float add: out[i] = in0[i] + in1[i] for i in [0, size). Returns NNACL_OK. */
 int ElementAdd(const float *in0, const float *in1, float *out, int size);
 /* As ElementAdd, but a sum that is not greater than 0 is stored as 0. */
 int ElementAddRelu(const float *in0, const float *in1, float *out, int size);
 /* As ElementAdd, but each sum is limited to the range [0, 6]. */
 int ElementAddRelu6(const float *in0, const float *in1, float *out, int size);
 /* Element-wise int add: out[i] = in0[i] + in1[i] for i in [0, size). Returns NNACL_OK. */
 int ElementAddInt(const int *in0, const int *in1, int *out, int size);
 /* Float add with one broadcast operand: in0[0] is added to every in1[i] when
  * param->in_elements_num0_ == 1, otherwise in1[0] is added to every in0[i]. */
 int ElementOptAdd(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
 /* Integer counterpart of ElementOptAdd. */
 int ElementOptAddInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param);
 /* As ElementOptAdd, followed by a max with 0. */
 int ElementOptAddRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
 /* As ElementOptAdd, followed by a max with 0 and a min with 6. */
 int ElementOptAddRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
 /* Calls TileDimensionsFp32 on in0/in1 and the tile buffers, then runs ElementAdd on
  * tile_in0 and tile_in1, writing size results to out. */
 int BroadcastAdd(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size,
                 ArithmeticParameter *param);

 #ifdef __cplusplus
 }
 #endif

 #endif  // MINDSPORE_LITE_NNACL_FP32_ADD_H_
--- a/mindspore/lite/nnacl/fp32/arithmetic_fp32.c
+++ b/mindspore/lite/nnacl/fp32/arithmetic_fp32.c
@@ -19,812 +19,6 @@

 #define ACCURACY_DATA 0.00000001

 /**
 * Float multiplication where one operand is a single broadcast value.
 * When param->in_elements_num0_ == 1, in0[0] multiplies every in1[i];
 * otherwise every in0[i] is multiplied by in1[0].
 * Writes size results to out and returns NNACL_OK.
 */
 int ElementOptMul(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
 #ifdef ENABLE_NEON
  // Both candidate broadcast values are splatted up front; only one is used below.
  const float32x4_t lhs_splat = vdupq_n_f32(in0[0]);
  const float32x4_t rhs_splat = vdupq_n_f32(in1[0]);
 #endif
  int i = 0;
  if (param->in_elements_num0_ == 1) {
 #ifdef ENABLE_NEON
    while (i <= size - 4) {
      const float32x4_t rhs = vld1q_f32(in1 + i);
      vst1q_f32(out + i, vmulq_f32(lhs_splat, rhs));
      i += C4NUM;
    }
 #endif
    // Scalar loop: handles the tail, or everything when NEON is disabled.
    while (i < size) {
      out[i] = in0[0] * in1[i];
      ++i;
    }
    return NNACL_OK;
  }
 #ifdef ENABLE_NEON
  while (i <= size - 4) {
    const float32x4_t lhs = vld1q_f32(in0 + i);
    vst1q_f32(out + i, vmulq_f32(lhs, rhs_splat));
    i += C4NUM;
  }
 #endif
  while (i < size) {
    out[i] = in0[i] * in1[0];
    ++i;
  }
  return NNACL_OK;
 }

 /**
 * Float multiplication with one broadcast operand, followed by a max with zero.
 * When param->in_elements_num0_ == 1, in0[0] multiplies every in1[i];
 * otherwise every in0[i] is multiplied by in1[0].
 * Writes size results to out and returns NNACL_OK.
 */
 int ElementOptMulRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
 #ifdef ENABLE_NEON
  const float32x4_t lhs_splat = vdupq_n_f32(in0[0]);
  const float32x4_t rhs_splat = vdupq_n_f32(in1[0]);
  const float32x4_t lower = vdupq_n_f32(0.0f);
 #endif
  int i = 0;
  if (param->in_elements_num0_ == 1) {
 #ifdef ENABLE_NEON
    while (i <= size - 4) {
      const float32x4_t prod = vmulq_f32(lhs_splat, vld1q_f32(in1 + i));
      vst1q_f32(out + i, vmaxq_f32(prod, lower));
      i += C4NUM;
    }
 #endif
    // Scalar loop: handles the tail, or everything when NEON is disabled.
    while (i < size) {
      out[i] = MSMAX(in0[0] * in1[i], 0);
      ++i;
    }
    return NNACL_OK;
  }
 #ifdef ENABLE_NEON
  while (i <= size - 4) {
    const float32x4_t prod = vmulq_f32(vld1q_f32(in0 + i), rhs_splat);
    vst1q_f32(out + i, vmaxq_f32(prod, lower));
    i += C4NUM;
  }
 #endif
  while (i < size) {
    out[i] = MSMAX(in0[i] * in1[0], 0);
    ++i;
  }
  return NNACL_OK;
 }

 /**
 * Float multiplication with one broadcast operand, followed by a max with 0 and a min with 6.
 * When param->in_elements_num0_ == 1, in0[0] multiplies every in1[i];
 * otherwise every in0[i] is multiplied by in1[0].
 * Writes size results to out and returns NNACL_OK.
 */
 int ElementOptMulRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
 #ifdef ENABLE_NEON
  const float32x4_t lhs_splat = vdupq_n_f32(in0[0]);
  const float32x4_t rhs_splat = vdupq_n_f32(in1[0]);
  const float32x4_t lower = vdupq_n_f32(0.0f);
  const float32x4_t upper = vdupq_n_f32(6.0f);
 #endif
  int i = 0;
  if (param->in_elements_num0_ == 1) {
 #ifdef ENABLE_NEON
    while (i <= size - 4) {
      const float32x4_t prod = vmulq_f32(lhs_splat, vld1q_f32(in1 + i));
      vst1q_f32(out + i, vminq_f32(vmaxq_f32(prod, lower), upper));
      i += C4NUM;
    }
 #endif
    // Scalar loop: handles the tail, or everything when NEON is disabled.
    while (i < size) {
      out[i] = MSMIN(MSMAX(in0[0] * in1[i], 0), 6);
      ++i;
    }
    return NNACL_OK;
  }
 #ifdef ENABLE_NEON
  while (i <= size - 4) {
    const float32x4_t prod = vmulq_f32(vld1q_f32(in0 + i), rhs_splat);
    vst1q_f32(out + i, vminq_f32(vmaxq_f32(prod, lower), upper));
    i += C4NUM;
  }
 #endif
  while (i < size) {
    out[i] = MSMIN(MSMAX(in0[i] * in1[0], 0), 6);
    ++i;
  }
  return NNACL_OK;
 }

 /**
 * Integer multiplication where one operand is a single broadcast value.
 * When param->in_elements_num0_ == 1, in0[0] multiplies every in1[i];
 * otherwise every in0[i] is multiplied by in1[0].
 * Writes size results to out and returns NNACL_OK.
 */
 int ElementOptMulInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) {
 #ifdef ENABLE_NEON
  // Both candidate broadcast values are splatted up front; only one is used below.
  const int32x4_t lhs_splat = vdupq_n_s32(in0[0]);
  const int32x4_t rhs_splat = vdupq_n_s32(in1[0]);
 #endif
  int i = 0;
  if (param->in_elements_num0_ == 1) {
 #ifdef ENABLE_NEON
    while (i <= size - 4) {
      const int32x4_t rhs = vld1q_s32(in1 + i);
      vst1q_s32(out + i, vmulq_s32(lhs_splat, rhs));
      i += C4NUM;
    }
 #endif
    // Scalar loop: handles the tail, or everything when NEON is disabled.
    while (i < size) {
      out[i] = in0[0] * in1[i];
      ++i;
    }
    return NNACL_OK;
  }
 #ifdef ENABLE_NEON
  while (i <= size - 4) {
    const int32x4_t lhs = vld1q_s32(in0 + i);
    vst1q_s32(out + i, vmulq_s32(lhs, rhs_splat));
    i += C4NUM;
  }
 #endif
  while (i < size) {
    out[i] = in0[i] * in1[0];
    ++i;
  }
  return NNACL_OK;
 }

 /**
 * Integer multiplication with one broadcast operand, followed by a max with zero.
 * When param->in_elements_num0_ == 1, in0[0] multiplies every in1[i];
 * otherwise every in0[i] is multiplied by in1[0].
 * Writes size results to out and returns NNACL_OK.
 */
 int ElementOptMulReluInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) {
 #ifdef ENABLE_NEON
  const int32x4_t lhs_splat = vdupq_n_s32(in0[0]);
  const int32x4_t rhs_splat = vdupq_n_s32(in1[0]);
  const int32x4_t lower = vdupq_n_s32(0);
 #endif
  int i = 0;
  if (param->in_elements_num0_ == 1) {
 #ifdef ENABLE_NEON
    while (i <= size - 4) {
      const int32x4_t prod = vmulq_s32(lhs_splat, vld1q_s32(in1 + i));
      vst1q_s32(out + i, vmaxq_s32(prod, lower));
      i += C4NUM;
    }
 #endif
    // Scalar loop: handles the tail, or everything when NEON is disabled.
    while (i < size) {
      out[i] = MSMAX(in0[0] * in1[i], 0);
      ++i;
    }
    return NNACL_OK;
  }
 #ifdef ENABLE_NEON
  while (i <= size - 4) {
    const int32x4_t prod = vmulq_s32(vld1q_s32(in0 + i), rhs_splat);
    vst1q_s32(out + i, vmaxq_s32(prod, lower));
    i += C4NUM;
  }
 #endif
  while (i < size) {
    out[i] = MSMAX(in0[i] * in1[0], 0);
    ++i;
  }
  return NNACL_OK;
 }

 /**
 * Integer multiplication with one broadcast operand, followed by a max with 0 and a min with 6.
 * When param->in_elements_num0_ == 1, in0[0] multiplies every in1[i];
 * otherwise every in0[i] is multiplied by in1[0].
 * Writes size results to out and returns NNACL_OK.
 */
 int ElementOptMulRelu6Int(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) {
 #ifdef ENABLE_NEON
  const int32x4_t lhs_splat = vdupq_n_s32(in0[0]);
  const int32x4_t rhs_splat = vdupq_n_s32(in1[0]);
  const int32x4_t lower = vdupq_n_s32(0);
  const int32x4_t upper = vdupq_n_s32(6);
 #endif
  int i = 0;
  if (param->in_elements_num0_ == 1) {
 #ifdef ENABLE_NEON
    while (i <= size - 4) {
      const int32x4_t prod = vmulq_s32(lhs_splat, vld1q_s32(in1 + i));
      vst1q_s32(out + i, vminq_s32(vmaxq_s32(prod, lower), upper));
      i += C4NUM;
    }
 #endif
    // Scalar loop: handles the tail, or everything when NEON is disabled.
    while (i < size) {
      out[i] = MSMIN(MSMAX(in0[0] * in1[i], 0), 6);
      ++i;
    }
    return NNACL_OK;
  }
 #ifdef ENABLE_NEON
  while (i <= size - 4) {
    const int32x4_t prod = vmulq_s32(vld1q_s32(in0 + i), rhs_splat);
    vst1q_s32(out + i, vminq_s32(vmaxq_s32(prod, lower), upper));
    i += C4NUM;
  }
 #endif
  while (i < size) {
    out[i] = MSMIN(MSMAX(in0[i] * in1[0], 0), 6);
    ++i;
  }
  return NNACL_OK;
 }

 /**
 * Float subtraction where one operand is a single broadcast value.
 * When param->in_elements_num0_ == 1, out[i] = in0[0] - in1[i];
 * otherwise out[i] = in0[i] - in1[0].
 * Writes size results to out and returns NNACL_OK.
 */
 int ElementOptSub(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
 #ifdef ENABLE_NEON
  // Both candidate broadcast values are splatted up front; only one is used below.
  const float32x4_t lhs_splat = vdupq_n_f32(in0[0]);
  const float32x4_t rhs_splat = vdupq_n_f32(in1[0]);
 #endif
  int i = 0;
  if (param->in_elements_num0_ == 1) {
 #ifdef ENABLE_NEON
    while (i <= size - 4) {
      const float32x4_t rhs = vld1q_f32(in1 + i);
      vst1q_f32(out + i, vsubq_f32(lhs_splat, rhs));
      i += C4NUM;
    }
 #endif
    // Scalar loop: handles the tail, or everything when NEON is disabled.
    while (i < size) {
      out[i] = in0[0] - in1[i];
      ++i;
    }
    return NNACL_OK;
  }
 #ifdef ENABLE_NEON
  while (i <= size - 4) {
    const float32x4_t lhs = vld1q_f32(in0 + i);
    vst1q_f32(out + i, vsubq_f32(lhs, rhs_splat));
    i += C4NUM;
  }
 #endif
  while (i < size) {
    out[i] = in0[i] - in1[0];
    ++i;
  }
  return NNACL_OK;
 }

 /**
 * Integer subtraction where one operand is a single broadcast value.
 * When param->in_elements_num0_ == 1, out[i] = in0[0] - in1[i];
 * otherwise out[i] = in0[i] - in1[0].
 * Writes size results to out and returns NNACL_OK.
 */
 int ElementOptSubInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) {
 #ifdef ENABLE_NEON
  // Both candidate broadcast values are splatted up front; only one is used below.
  const int32x4_t lhs_splat = vdupq_n_s32(in0[0]);
  const int32x4_t rhs_splat = vdupq_n_s32(in1[0]);
 #endif
  int i = 0;
  if (param->in_elements_num0_ == 1) {
 #ifdef ENABLE_NEON
    while (i <= size - 4) {
      const int32x4_t rhs = vld1q_s32(in1 + i);
      vst1q_s32(out + i, vsubq_s32(lhs_splat, rhs));
      i += C4NUM;
    }
 #endif
    // Scalar loop: handles the tail, or everything when NEON is disabled.
    while (i < size) {
      out[i] = in0[0] - in1[i];
      ++i;
    }
    return NNACL_OK;
  }
 #ifdef ENABLE_NEON
  while (i <= size - 4) {
    const int32x4_t lhs = vld1q_s32(in0 + i);
    vst1q_s32(out + i, vsubq_s32(lhs, rhs_splat));
    i += C4NUM;
  }
 #endif
  while (i < size) {
    out[i] = in0[i] - in1[0];
    ++i;
  }
  return NNACL_OK;
 }

 /**
 * Float subtraction with one broadcast operand, followed by a max with zero.
 * When param->in_elements_num0_ == 1, the difference is in0[0] - in1[i];
 * otherwise it is in0[i] - in1[0].
 * Writes size results to out and returns NNACL_OK.
 */
 int ElementOptSubRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
 #ifdef ENABLE_NEON
  const float32x4_t lhs_splat = vdupq_n_f32(in0[0]);
  const float32x4_t rhs_splat = vdupq_n_f32(in1[0]);
  const float32x4_t lower = vdupq_n_f32(0.0f);
 #endif
  int i = 0;
  if (param->in_elements_num0_ == 1) {
 #ifdef ENABLE_NEON
    while (i <= size - 4) {
      const float32x4_t diff = vsubq_f32(lhs_splat, vld1q_f32(in1 + i));
      vst1q_f32(out + i, vmaxq_f32(diff, lower));
      i += C4NUM;
    }
 #endif
    // Scalar loop: handles the tail, or everything when NEON is disabled.
    while (i < size) {
      out[i] = MSMAX(in0[0] - in1[i], 0);
      ++i;
    }
    return NNACL_OK;
  }
 #ifdef ENABLE_NEON
  while (i <= size - 4) {
    const float32x4_t diff = vsubq_f32(vld1q_f32(in0 + i), rhs_splat);
    vst1q_f32(out + i, vmaxq_f32(diff, lower));
    i += C4NUM;
  }
 #endif
  while (i < size) {
    out[i] = MSMAX(in0[i] - in1[0], 0);
    ++i;
  }
  return NNACL_OK;
 }

 /**
 * Float subtraction with one broadcast operand, followed by a max with 0 and a min with 6.
 * When param->in_elements_num0_ == 1, the difference is in0[0] - in1[i];
 * otherwise it is in0[i] - in1[0].
 * Writes size results to out and returns NNACL_OK.
 */
 int ElementOptSubRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
 #ifdef ENABLE_NEON
  const float32x4_t lhs_splat = vdupq_n_f32(in0[0]);
  const float32x4_t rhs_splat = vdupq_n_f32(in1[0]);
  const float32x4_t lower = vdupq_n_f32(0.0f);
  const float32x4_t upper = vdupq_n_f32(6.0f);
 #endif
  int i = 0;
  if (param->in_elements_num0_ == 1) {
 #ifdef ENABLE_NEON
    while (i <= size - 4) {
      const float32x4_t diff = vsubq_f32(lhs_splat, vld1q_f32(in1 + i));
      vst1q_f32(out + i, vminq_f32(vmaxq_f32(diff, lower), upper));
      i += C4NUM;
    }
 #endif
    // Scalar loop: handles the tail, or everything when NEON is disabled.
    while (i < size) {
      out[i] = MSMIN(MSMAX(in0[0] - in1[i], 0), 6);
      ++i;
    }
    return NNACL_OK;
  }
 #ifdef ENABLE_NEON
  while (i <= size - 4) {
    const float32x4_t diff = vsubq_f32(vld1q_f32(in0 + i), rhs_splat);
    vst1q_f32(out + i, vminq_f32(vmaxq_f32(diff, lower), upper));
    i += C4NUM;
  }
 #endif
  while (i < size) {
    out[i] = MSMIN(MSMAX(in0[i] - in1[0], 0), 6);
    ++i;
  }
  return NNACL_OK;
 }

 /**
 * Float addition where one operand is a single broadcast value.
 * When param->in_elements_num0_ == 1, in0[0] is added to every in1[i];
 * otherwise in1[0] is added to every in0[i].
 * Writes size results to out and returns NNACL_OK.
 */
 int ElementOptAdd(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
 #ifdef ENABLE_NEON
  // Both candidate broadcast values are splatted up front; only one is used below.
  const float32x4_t lhs_splat = vdupq_n_f32(in0[0]);
  const float32x4_t rhs_splat = vdupq_n_f32(in1[0]);
 #endif
  int i = 0;
  if (param->in_elements_num0_ == 1) {
 #ifdef ENABLE_NEON
    while (i <= size - 4) {
      const float32x4_t rhs = vld1q_f32(in1 + i);
      vst1q_f32(out + i, vaddq_f32(lhs_splat, rhs));
      i += C4NUM;
    }
 #endif
    // Scalar loop: handles the tail, or everything when NEON is disabled.
    while (i < size) {
      out[i] = in0[0] + in1[i];
      ++i;
    }
    return NNACL_OK;
  }
 #ifdef ENABLE_NEON
  while (i <= size - 4) {
    const float32x4_t lhs = vld1q_f32(in0 + i);
    vst1q_f32(out + i, vaddq_f32(lhs, rhs_splat));
    i += C4NUM;
  }
 #endif
  while (i < size) {
    out[i] = in0[i] + in1[0];
    ++i;
  }
  return NNACL_OK;
 }

 /**
 * Integer addition where one operand is a single broadcast value.
 * When param->in_elements_num0_ == 1, in0[0] is added to every in1[i];
 * otherwise in1[0] is added to every in0[i].
 * Writes size results to out and returns NNACL_OK.
 */
 int ElementOptAddInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) {
 #ifdef ENABLE_NEON
  // Both candidate broadcast values are splatted up front; only one is used below.
  const int32x4_t lhs_splat = vdupq_n_s32(in0[0]);
  const int32x4_t rhs_splat = vdupq_n_s32(in1[0]);
 #endif
  int i = 0;
  if (param->in_elements_num0_ == 1) {
 #ifdef ENABLE_NEON
    while (i <= size - 4) {
      const int32x4_t rhs = vld1q_s32(in1 + i);
      vst1q_s32(out + i, vaddq_s32(lhs_splat, rhs));
      i += C4NUM;
    }
 #endif
    // Scalar loop: handles the tail, or everything when NEON is disabled.
    while (i < size) {
      out[i] = in0[0] + in1[i];
      ++i;
    }
    return NNACL_OK;
  }
 #ifdef ENABLE_NEON
  while (i <= size - 4) {
    const int32x4_t lhs = vld1q_s32(in0 + i);
    vst1q_s32(out + i, vaddq_s32(lhs, rhs_splat));
    i += C4NUM;
  }
 #endif
  while (i < size) {
    out[i] = in0[i] + in1[0];
    ++i;
  }
  return NNACL_OK;
 }

 /**
 * Float addition with one broadcast operand, followed by a max with zero.
 * When param->in_elements_num0_ == 1, in0[0] is added to every in1[i];
 * otherwise in1[0] is added to every in0[i].
 * Writes size results to out and returns NNACL_OK.
 */
 int ElementOptAddRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
 #ifdef ENABLE_NEON
  const float32x4_t lhs_splat = vdupq_n_f32(in0[0]);
  const float32x4_t rhs_splat = vdupq_n_f32(in1[0]);
  const float32x4_t lower = vdupq_n_f32(0.0f);
 #endif
  int i = 0;
  if (param->in_elements_num0_ == 1) {
 #ifdef ENABLE_NEON
    while (i <= size - 4) {
      const float32x4_t sum = vaddq_f32(lhs_splat, vld1q_f32(in1 + i));
      vst1q_f32(out + i, vmaxq_f32(sum, lower));
      i += C4NUM;
    }
 #endif
    // Scalar loop: handles the tail, or everything when NEON is disabled.
    while (i < size) {
      out[i] = MSMAX(in0[0] + in1[i], 0);
      ++i;
    }
    return NNACL_OK;
  }
 #ifdef ENABLE_NEON
  while (i <= size - 4) {
    const float32x4_t sum = vaddq_f32(vld1q_f32(in0 + i), rhs_splat);
    vst1q_f32(out + i, vmaxq_f32(sum, lower));
    i += C4NUM;
  }
 #endif
  while (i < size) {
    out[i] = MSMAX(in0[i] + in1[0], 0);
    ++i;
  }
  return NNACL_OK;
 }

 /**
 * Float addition with one broadcast operand, followed by a max with 0 and a min with 6.
 * When param->in_elements_num0_ == 1, in0[0] is added to every in1[i];
 * otherwise in1[0] is added to every in0[i].
 * Writes size results to out and returns NNACL_OK.
 */
 int ElementOptAddRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
 #ifdef ENABLE_NEON
  const float32x4_t lhs_splat = vdupq_n_f32(in0[0]);
  const float32x4_t rhs_splat = vdupq_n_f32(in1[0]);
  const float32x4_t lower = vdupq_n_f32(0.0f);
  const float32x4_t upper = vdupq_n_f32(6.0f);
 #endif
  int i = 0;
  if (param->in_elements_num0_ == 1) {
 #ifdef ENABLE_NEON
    while (i <= size - 4) {
      const float32x4_t sum = vaddq_f32(lhs_splat, vld1q_f32(in1 + i));
      vst1q_f32(out + i, vminq_f32(vmaxq_f32(sum, lower), upper));
      i += C4NUM;
    }
 #endif
    // Scalar loop: handles the tail, or everything when NEON is disabled.
    while (i < size) {
      out[i] = MSMIN(MSMAX(in0[0] + in1[i], 0), 6);
      ++i;
    }
    return NNACL_OK;
  }
 #ifdef ENABLE_NEON
  while (i <= size - 4) {
    const float32x4_t sum = vaddq_f32(vld1q_f32(in0 + i), rhs_splat);
    vst1q_f32(out + i, vminq_f32(vmaxq_f32(sum, lower), upper));
    i += C4NUM;
  }
 #endif
  while (i < size) {
    out[i] = MSMIN(MSMAX(in0[i] + in1[0], 0), 6);
    ++i;
  }
  return NNACL_OK;
 }

 /**
 * Float division where one operand is a single broadcast value.
 * When param->in_elements_num0_ == 1, out[i] = in0[0] / in1[i] with no divisor check.
 * Otherwise out[i] = in0[i] / in1[0], and a zero in1[0] is rejected with
 * NNACL_ERRCODE_DIVISOR_ZERO before anything is written.
 * Returns NNACL_OK on success.
 */
 int ElementOptDiv(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
  int i = 0;
  if (param->in_elements_num0_ == 1) {
    while (i < size) {
      out[i] = in0[0] / in1[i];
      ++i;
    }
    return NNACL_OK;
  }
  // The shared divisor is validated once, before the loop.
  if (in1[0] == 0) {
    return NNACL_ERRCODE_DIVISOR_ZERO;
  }
  while (i < size) {
    out[i] = in0[i] / in1[0];
    ++i;
  }
  return NNACL_OK;
 }

 /**
 * Float division with one broadcast operand; a quotient is kept only if it is greater
 * than zero, otherwise zero is stored.
 * When param->in_elements_num0_ == 1 the quotient is in0[0] / in1[i];
 * otherwise it is in0[i] / in1[0]. No divisor check is performed.
 * Returns NNACL_OK.
 */
 int ElementOptDivRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
  int i = 0;
  if (param->in_elements_num0_ == 1) {
    while (i < size) {
      const float quotient = in0[0] / in1[i];
      out[i] = quotient > 0 ? quotient : 0;
      ++i;
    }
    return NNACL_OK;
  }
  while (i < size) {
    const float quotient = in0[i] / in1[0];
    out[i] = quotient > 0 ? quotient : 0;
    ++i;
  }
  return NNACL_OK;
 }

 /**
 * Float division with one broadcast operand, followed by a max with 0 and a min with 6.
 * When param->in_elements_num0_ == 1 the quotient is in0[0] / in1[i];
 * otherwise it is in0[i] / in1[0]. No divisor check is performed.
 * Returns NNACL_OK.
 */
 int ElementOptDivRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
  int i = 0;
  if (param->in_elements_num0_ == 1) {
    while (i < size) {
      out[i] = MSMIN(MSMAX(in0[0] / in1[i], 0), 6);
      ++i;
    }
    return NNACL_OK;
  }
  while (i < size) {
    out[i] = MSMIN(MSMAX(in0[i] / in1[0], 0), 6);
    ++i;
  }
  return NNACL_OK;
 }

 /**
 * Integer division where one operand is a single broadcast value.
 * When param->in_elements_num0_ == 1, out[i] = in0[0] / in1[i];
 * otherwise out[i] = in0[i] / in1[0].
 *
 * Returns NNACL_ERRCODE_DIVISOR_ZERO if a zero divisor is found, otherwise NNACL_OK.
 * With per-element divisors the check happens inside the loop, so elements before the
 * first zero divisor have already been written when the error is returned.
 */
 int ElementOptDivInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) {
  if (param->in_elements_num0_ == 1) {
    for (int index = 0; index < size; index++) {
      // Integer division by zero is undefined behaviour, so every divisor is checked.
      if (in1[index] == 0) {
        return NNACL_ERRCODE_DIVISOR_ZERO;
      }
      out[index] = in0[0] / in1[index];
    }
  } else {
    // The shared divisor is validated once, before anything is written.
    if (in1[0] == 0) {
      return NNACL_ERRCODE_DIVISOR_ZERO;
    }
    for (int index = 0; index < size; index++) {
      out[index] = in0[index] / in1[0];
    }
  }
  return NNACL_OK;
 }

 /**
 * Passes in0/in1 and the tile buffers to TileDimensionsFp32, then adds the two tile
 * buffers element-wise into out.
 * Returns the status produced by ElementAdd.
 */
 int BroadcastAdd(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size,
                 ArithmeticParameter *param) {
  int status;
  TileDimensionsFp32(in0, in1, tile_in0, tile_in1, param);
  status = ElementAdd(tile_in0, tile_in1, out, size);
  return status;
 }

 /**
 * Passes in0/in1 and the tile buffers to TileDimensionsFp32, then multiplies the two
 * tile buffers element-wise into out.
 * Returns the status produced by ElementMul.
 */
 int BroadcastMul(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size,
                 ArithmeticParameter *param) {
  int status;
  TileDimensionsFp32(in0, in1, tile_in0, tile_in1, param);
  status = ElementMul(tile_in0, tile_in1, out, size);
  return status;
 }

 /**
 * Element-wise float multiplication: out[i] = in0[i] * in1[i] for i in [0, size).
 * Always returns NNACL_OK.
 */
 int ElementMul(const float *in0, const float *in1, float *out, int size) {
  int pos = 0;
 #ifdef ENABLE_NEON
  // Vector loop: runs while at least 4 elements remain.
  while (pos <= size - 4) {
    const float32x4_t a = vld1q_f32(in0 + pos);
    const float32x4_t b = vld1q_f32(in1 + pos);
    vst1q_f32(out + pos, vmulq_f32(a, b));
    pos += C4NUM;
  }
 #endif
  // Scalar loop: handles the tail, or everything when NEON is disabled.
  while (pos < size) {
    out[pos] = in0[pos] * in1[pos];
    ++pos;
  }
  return NNACL_OK;
 }

 /**
 * Element-wise float multiplication where a product is kept only if it is greater than
 * zero; any other product is stored as zero.
 * Always returns NNACL_OK.
 */
 int ElementMulRelu(const float *in0, const float *in1, float *out, int size) {
  int pos = 0;
 #ifdef ENABLE_NEON
  const float32x4_t zero_v = vdupq_n_f32(0.0f);
  while (pos <= size - 4) {
    const float32x4_t a = vld1q_f32(in0 + pos);
    const float32x4_t b = vld1q_f32(in1 + pos);
    const float32x4_t prod = vmulq_f32(a, b);
    // Lane mask: set where the product is strictly positive.
    const uint32x4_t keep = vcgtq_f32(prod, zero_v);
    vst1q_f32(out + pos, vbslq_f32(keep, prod, zero_v));
    pos += C4NUM;
  }
 #endif
  while (pos < size) {
    const float prod = in0[pos] * in1[pos];
    if (prod > 0) {
      out[pos] = prod;
    } else {
      out[pos] = 0;
    }
    ++pos;
  }
  return NNACL_OK;
 }

 /**
 * Element-wise float multiplication followed by a max with 0 and a min with 6.
 * Always returns NNACL_OK.
 */
 int ElementMulRelu6(const float *in0, const float *in1, float *out, int size) {
  int pos = 0;
 #ifdef ENABLE_NEON
  const float32x4_t lower = vdupq_n_f32(0.0f);
  const float32x4_t upper = vdupq_n_f32(6.0f);
  while (pos <= size - 4) {
    const float32x4_t a = vld1q_f32(in0 + pos);
    const float32x4_t b = vld1q_f32(in1 + pos);
    const float32x4_t prod = vmulq_f32(a, b);
    vst1q_f32(out + pos, vminq_f32(vmaxq_f32(prod, lower), upper));
    pos += C4NUM;
  }
 #endif
  // Scalar loop: handles the tail, or everything when NEON is disabled.
  while (pos < size) {
    out[pos] = MSMIN(MSMAX(in0[pos] * in1[pos], 0), 6);
    ++pos;
  }
  return NNACL_OK;
 }

 /**
 * Element-wise integer multiplication: out[i] = in0[i] * in1[i] for i in [0, size).
 * Always returns NNACL_OK.
 */
 int ElementMulInt(const int *in0, const int *in1, int *out, int size) {
  int pos = 0;
 #ifdef ENABLE_NEON
  // Vector loop: runs while at least 4 elements remain.
  while (pos <= size - 4) {
    const int32x4_t a = vld1q_s32(in0 + pos);
    const int32x4_t b = vld1q_s32(in1 + pos);
    vst1q_s32(out + pos, vmulq_s32(a, b));
    pos += C4NUM;
  }
 #endif
  // Scalar loop: handles the tail, or everything when NEON is disabled.
  while (pos < size) {
    out[pos] = in0[pos] * in1[pos];
    ++pos;
  }
  return NNACL_OK;
 }

 /**
 * Element-wise integer multiplication where a product is kept only if it is greater
 * than zero; any other product is stored as zero.
 * Writes size results to out and always returns NNACL_OK.
 */
 int ElementMulReluInt(const int *in0, const int *in1, int *out, int size) {
  int index = 0;
 #ifdef ENABLE_NEON
  int32x4_t zeros = vdupq_n_s32(0);
  for (; index <= size - 4; index += C4NUM) {
    int32x4_t vin0 = vld1q_s32(in0 + index);
    int32x4_t vin1 = vld1q_s32(in1 + index);
    int32x4_t vout = vmulq_s32(vin0, vin1);
    // Select the product where it is strictly positive, zero elsewhere.
    vout = vbslq_s32(vcgtq_s32(vout, zeros), vout, zeros);
    vst1q_s32(out + index, vout);
  }
 #endif
  for (; index < size; index++) {
    // Keep the product as int: a float temporary would round large products and make
    // this loop disagree with the NEON path above.
    int res = in0[index] * in1[index];
    out[index] = res > 0 ? res : 0;
  }
  return NNACL_OK;
 }

 /**
 * Element-wise integer multiplication followed by a max with 0 and a min with 6.
 * Always returns NNACL_OK.
 */
 int ElementMulRelu6Int(const int *in0, const int *in1, int *out, int size) {
  int pos = 0;
 #ifdef ENABLE_NEON
  const int32x4_t lower = vdupq_n_s32(0);
  const int32x4_t upper = vdupq_n_s32(6);
  while (pos <= size - 4) {
    const int32x4_t a = vld1q_s32(in0 + pos);
    const int32x4_t b = vld1q_s32(in1 + pos);
    const int32x4_t prod = vmulq_s32(a, b);
    vst1q_s32(out + pos, vminq_s32(vmaxq_s32(prod, lower), upper));
    pos += C4NUM;
  }
 #endif
  // Scalar loop: handles the tail, or everything when NEON is disabled.
  while (pos < size) {
    out[pos] = MSMIN(MSMAX(in0[pos] * in1[pos], 0), 6);
    ++pos;
  }
  return NNACL_OK;
 }

 /**
 * Element-wise float addition: out[i] = in0[i] + in1[i] for i in [0, size).
 * Always returns NNACL_OK.
 */
 int ElementAdd(const float *in0, const float *in1, float *out, int size) {
  int pos = 0;
 #ifdef ENABLE_NEON
  // Vector loop: runs while at least 4 elements remain.
  while (pos <= size - 4) {
    const float32x4_t a = vld1q_f32(in0 + pos);
    const float32x4_t b = vld1q_f32(in1 + pos);
    vst1q_f32(out + pos, vaddq_f32(a, b));
    pos += C4NUM;
  }
 #endif
  // Scalar loop: handles the tail, or everything when NEON is disabled.
  while (pos < size) {
    out[pos] = in0[pos] + in1[pos];
    ++pos;
  }
  return NNACL_OK;
 }

 /**
 * Element-wise float addition where a sum is kept only if it is greater than zero;
 * any other sum is stored as zero.
 * Always returns NNACL_OK.
 */
 int ElementAddRelu(const float *in0, const float *in1, float *out, int size) {
  int pos = 0;
 #ifdef ENABLE_NEON
  const float32x4_t zero_v = vdupq_n_f32(0.0f);
  while (pos <= size - 4) {
    const float32x4_t a = vld1q_f32(in0 + pos);
    const float32x4_t b = vld1q_f32(in1 + pos);
    const float32x4_t sum = vaddq_f32(a, b);
    // Lane mask: set where the sum is strictly positive.
    const uint32x4_t keep = vcgtq_f32(sum, zero_v);
    vst1q_f32(out + pos, vbslq_f32(keep, sum, zero_v));
    pos += C4NUM;
  }
 #endif
  while (pos < size) {
    const float sum = in0[pos] + in1[pos];
    if (sum > 0) {
      out[pos] = sum;
    } else {
      out[pos] = 0;
    }
    ++pos;
  }
  return NNACL_OK;
 }

 /**
 * Element-wise float addition followed by a max with 0 and a min with 6.
 * Always returns NNACL_OK.
 */
 int ElementAddRelu6(const float *in0, const float *in1, float *out, int size) {
  int pos = 0;
 #ifdef ENABLE_NEON
  const float32x4_t lower = vdupq_n_f32(0.0f);
  const float32x4_t upper = vdupq_n_f32(6.0f);
  while (pos <= size - 4) {
    const float32x4_t a = vld1q_f32(in0 + pos);
    const float32x4_t b = vld1q_f32(in1 + pos);
    const float32x4_t sum = vaddq_f32(a, b);
    vst1q_f32(out + pos, vminq_f32(vmaxq_f32(sum, lower), upper));
    pos += C4NUM;
  }
 #endif
  // Scalar loop: handles the tail, or everything when NEON is disabled.
  while (pos < size) {
    out[pos] = MSMIN(MSMAX(in0[pos] + in1[pos], 0), 6);
    ++pos;
  }
  return NNACL_OK;
 }

 /**
 * Element-wise integer addition: out[i] = in0[i] + in1[i] for i in [0, size).
 * Always returns NNACL_OK.
 */
 int ElementAddInt(const int *in0, const int *in1, int *out, int size) {
  int pos = 0;
 #ifdef ENABLE_NEON
  // Vector loop: runs while at least 4 elements remain.
  while (pos <= size - 4) {
    const int32x4_t a = vld1q_s32(in0 + pos);
    const int32x4_t b = vld1q_s32(in1 + pos);
    vst1q_s32(out + pos, vaddq_s32(a, b));
    pos += C4NUM;
  }
 #endif
  // Scalar loop: handles the tail, or everything when NEON is disabled.
  while (pos < size) {
    out[pos] = in0[pos] + in1[pos];
    ++pos;
  }
  return NNACL_OK;
 }

 /**
 * Element-wise float subtraction: out[i] = in0[i] - in1[i] for i in [0, size).
 * Always returns NNACL_OK.
 */
 int ElementSub(const float *in0, const float *in1, float *out, int size) {
  int pos = 0;
 #ifdef ENABLE_NEON
  // Vector loop: runs while at least 4 elements remain.
  while (pos <= size - 4) {
    const float32x4_t a = vld1q_f32(in0 + pos);
    const float32x4_t b = vld1q_f32(in1 + pos);
    vst1q_f32(out + pos, vsubq_f32(a, b));
    pos += C4NUM;
  }
 #endif
  // Scalar loop: handles the tail, or everything when NEON is disabled.
  while (pos < size) {
    out[pos] = in0[pos] - in1[pos];
    ++pos;
  }
  return NNACL_OK;
 }

 /**
 * Element-wise integer subtraction: out[i] = in0[i] - in1[i] for i in [0, size).
 * Always returns NNACL_OK.
 */
 int ElementSubInt(const int *in0, const int *in1, int *out, int size) {
  int pos = 0;
 #ifdef ENABLE_NEON
  // Vector loop: runs while at least 4 elements remain.
  while (pos <= size - 4) {
    const int32x4_t a = vld1q_s32(in0 + pos);
    const int32x4_t b = vld1q_s32(in1 + pos);
    vst1q_s32(out + pos, vsubq_s32(a, b));
    pos += C4NUM;
  }
 #endif
  // Scalar loop: handles the tail, or everything when NEON is disabled.
  while (pos < size) {
    out[pos] = in0[pos] - in1[pos];
    ++pos;
  }
  return NNACL_OK;
 }

 /**
 * Element-wise float subtraction where a difference is kept only if it is greater than
 * zero; any other difference is stored as zero.
 * Always returns NNACL_OK.
 */
 int ElementSubRelu(const float *in0, const float *in1, float *out, int size) {
  int pos = 0;
 #ifdef ENABLE_NEON
  const float32x4_t zero_v = vdupq_n_f32(0.0f);
  while (pos <= size - 4) {
    const float32x4_t a = vld1q_f32(in0 + pos);
    const float32x4_t b = vld1q_f32(in1 + pos);
    const float32x4_t diff = vsubq_f32(a, b);
    // Lane mask: set where the difference is strictly positive.
    const uint32x4_t keep = vcgtq_f32(diff, zero_v);
    vst1q_f32(out + pos, vbslq_f32(keep, diff, zero_v));
    pos += C4NUM;
  }
 #endif
  while (pos < size) {
    const float diff = in0[pos] - in1[pos];
    if (diff > 0) {
      out[pos] = diff;
    } else {
      out[pos] = 0;
    }
    ++pos;
  }
  return NNACL_OK;
 }

 /**
 * Element-wise float subtraction followed by a max with 0 and a min with 6.
 * Always returns NNACL_OK.
 */
 int ElementSubRelu6(const float *in0, const float *in1, float *out, int size) {
  int pos = 0;
 #ifdef ENABLE_NEON
  const float32x4_t lower = vdupq_n_f32(0.0f);
  const float32x4_t upper = vdupq_n_f32(6.0f);
  while (pos <= size - 4) {
    const float32x4_t a = vld1q_f32(in0 + pos);
    const float32x4_t b = vld1q_f32(in1 + pos);
    const float32x4_t diff = vsubq_f32(a, b);
    vst1q_f32(out + pos, vminq_f32(vmaxq_f32(diff, lower), upper));
    pos += C4NUM;
  }
 #endif
  // Scalar loop: handles the tail, or everything when NEON is disabled.
  while (pos < size) {
    out[pos] = MSMIN(MSMAX(in0[pos] - in1[pos], 0), 6);
    ++pos;
  }
  return NNACL_OK;
 }

 /**
 * Passes in0/in1 and the tile buffers to TileDimensionsFp32, then divides tile_in0 by
 * tile_in1 element-wise into out.
 * Returns the status produced by ElementDiv.
 */
 int BroadcastDiv(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size,
                 ArithmeticParameter *param) {
  int status;
  TileDimensionsFp32(in0, in1, tile_in0, tile_in1, param);
  status = ElementDiv(tile_in0, tile_in1, out, size);
  return status;
 }

 /**
 * Element-wise float division: out[i] = in0[i] / in1[i] for i in [0, size).
 * Divisors are not checked. Always returns NNACL_OK.
 */
 int ElementDiv(const float *in0, const float *in1, float *out, int size) {
  int pos = 0;
  while (pos < size) {
    out[pos] = in0[pos] / in1[pos];
    ++pos;
  }
  return NNACL_OK;
 }

 /**
 * Element-wise float division where a quotient is kept only if it is greater than
 * zero; any other quotient is stored as zero.
 * Divisors are not checked. Always returns NNACL_OK.
 */
 int ElementDivRelu(const float *in0, const float *in1, float *out, int size) {
  int pos = 0;
  while (pos < size) {
    const float quotient = in0[pos] / in1[pos];
    if (quotient > 0) {
      out[pos] = quotient;
    } else {
      out[pos] = 0;
    }
    ++pos;
  }
  return NNACL_OK;
 }

 /**
 * Element-wise float division followed by a max with 0 and a min with 6.
 * Divisors are not checked. Always returns NNACL_OK.
 */
 int ElementDivRelu6(const float *in0, const float *in1, float *out, int size) {
  int pos = 0;
  while (pos < size) {
    out[pos] = MSMIN(MSMAX(in0[pos] / in1[pos], 0), 6);
    ++pos;
  }
  return NNACL_OK;
 }

 int ElementFloorMod(const float *in0, const float *in1, float *out, int size) {
  for (int i = 0; i < size; i++) {
    out[i] = in0[i] - floorf(in0[i] / in1[i]) * in1[i];
@@ -929,11 +123,6 @@ int ElementLogicalAndBool(const bool *in0, const bool *in1, bool *out, int size)
  return NNACL_OK;
 }

 /*
  * Squared difference in two passes over out: first out = in0 - in1 (ElementSub), then
  * out = out * out in place (ElementMul). Returns the status of the multiply pass.
  */
 int ElementSquaredDifference(const float *in0, const float *in1, float *out, int size) {
  // The status returned by the subtraction pass is not inspected.
  (void)ElementSub(in0, in1, out, size);
  const int status = ElementMul(out, out, out, size);
  return status;
 }

 int ElementLogicalOr(const float *in0, const float *in1, float *out, int size) {
  int index = 0;
 #ifdef ENABLE_NEON
--- a/mindspore/lite/nnacl/fp32/arithmetic_fp32.h
+++ b/mindspore/lite/nnacl/fp32/arithmetic_fp32.h
@@ -22,6 +22,11 @@
 #include "nnacl/op_base.h"
 #include "nnacl/base/arithmetic_base.h"
 #include "nnacl/errorcode.h"
 #include "nnacl/fp32/add_fp32.h"
 #include "nnacl/fp32/mul_fp32.h"
 #include "nnacl/fp32/div_fp32.h"
 #include "nnacl/fp32/sub_fp32.h"
 #include "nnacl/fp32/squared_difference.h"

 #ifdef __cplusplus
 extern "C" {
@@ -30,56 +35,6 @@ void TileOneDimensionFp32(const float *inData, float *outData, int dim, size_t n
                          const int *inStrides, const int *outStrides, const int *multiple);
 void TileDimensionsFp32(const float *data0, const float *data1, float *tile_data0, float *tile_data1,
                        ArithmeticParameter *param);

 /*
  * Mul: element-wise products for float and int buffers. Relu / Relu6 suffixes name the activation
  * applied to each product; "Opt" variants take an ArithmeticParameter, and BroadcastMul takes two
  * tile buffers. The same prototypes are declared in nnacl/fp32/mul_fp32.h.
  */
 int ElementMul(const float *in0, const float *in1, float *out, int size);
 int ElementMulRelu(const float *in0, const float *in1, float *out, int size);
 int ElementMulRelu6(const float *in0, const float *in1, float *out, int size);
 int ElementMulInt(const int *in0, const int *in1, int *out, int size);
 int ElementMulReluInt(const int *in0, const int *in1, int *out, int size);
 int ElementMulRelu6Int(const int *in0, const int *in1, int *out, int size);
 int ElementOptMul(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
 int ElementOptMulRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
 int ElementOptMulRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
 int ElementOptMulInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param);
 int ElementOptMulReluInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param);
 int ElementOptMulRelu6Int(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param);
 int BroadcastMul(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size,
                  ArithmeticParameter *param);

 /*
  * Add: element-wise sums, with the same naming scheme as the Mul group.
  * NOTE(review): presumably these are also declared in nnacl/fp32/add_fp32.h - confirm.
  */
 int ElementAdd(const float *in0, const float *in1, float *out, int size);
 int ElementAddRelu(const float *in0, const float *in1, float *out, int size);
 int ElementAddRelu6(const float *in0, const float *in1, float *out, int size);
 int ElementAddInt(const int *in0, const int *in1, int *out, int size);
 int ElementOptAdd(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
 int ElementOptAddInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param);
 int ElementOptAddRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
 int ElementOptAddRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
 int BroadcastAdd(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size,
                  ArithmeticParameter *param);

 /*
  * Sub: element-wise differences (in0 minus in1), with the same naming scheme as the Mul group.
  * The same prototypes are declared in nnacl/fp32/sub_fp32.h.
  */
 int ElementSub(const float *in0, const float *in1, float *out, int size);
 int ElementSubInt(const int *in0, const int *in1, int *out, int size);
 int ElementSubRelu(const float *in0, const float *in1, float *out, int size);
 int ElementSubRelu6(const float *in0, const float *in1, float *out, int size);
 int ElementOptSub(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
 int ElementOptSubRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
 int ElementOptSubRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
 int ElementOptSubInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param);

 /*
  * Div: element-wise quotients (in0 divided by in1), with the same naming scheme as the Mul group.
  * The same prototypes are declared in nnacl/fp32/div_fp32.h.
  */
 int ElementDiv(const float *in0, const float *in1, float *out, int size);
 int ElementDivRelu(const float *in0, const float *in1, float *out, int size);
 int ElementDivRelu6(const float *in0, const float *in1, float *out, int size);
 int ElementOptDiv(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
 int ElementOptDivRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
 int ElementOptDivRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
 int ElementOptDivInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param);
 int BroadcastDiv(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size,
                  ArithmeticParameter *param);

 /* logical and */
 int ElementLogicalAnd(const float *in0, const float *in1, float *out, int size);
 int ElementLogicalAndInt(const int *in0, const int *in1, int *out, int size);
@@ -88,9 +43,6 @@ int ElementLogicalAndBool(const bool *in0, const bool *in1, bool *out, int size)
 /* logical or */
 int ElementLogicalOr(const float *in0, const float *in1, float *out, int size);

 /* Element Squared Difference */
 int ElementSquaredDifference(const float *in0, const float *in1, float *out, int size);

 /* max min */
 int ElementMaximum(const float *in0, const float *in1, float *out, int size);
 int ElementMinimum(const float *in0, const float *in1, float *out, int size);
--- a/mindspore/lite/nnacl/fp32/div_fp32.c
+++ b/mindspore/lite/nnacl/fp32/div_fp32.c
@@ -0,0 +1,107 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "nnacl/fp32/div_fp32.h"
 #include <math.h>
 #include "nnacl/fp32/arithmetic_fp32.h"

 /*
  * Float division where one operand contributes only its first element.
  *   param->in_elements_num0_ == 1 : out[i] = in0[0] / in1[i]
  *   otherwise                      : out[i] = in0[i] / in1[0]
  * In the second case a zero in1[0] yields NNACL_ERRCODE_DIVISOR_ZERO without writing to out;
  * the first case performs no zero check. Returns NNACL_OK otherwise.
  */
 int ElementOptDiv(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
  if (param->in_elements_num0_ != 1) {
    // Single divisor in1[0] shared by every element of in0.
    if (in1[0] == 0) {
      return NNACL_ERRCODE_DIVISOR_ZERO;
    }
    int pos = 0;
    while (pos < size) {
      out[pos] = in0[pos] / in1[0];
      ++pos;
    }
    return NNACL_OK;
  }
  // Single dividend in0[0] divided by every element of in1.
  int pos = 0;
  while (pos < size) {
    out[pos] = in0[0] / in1[pos];
    ++pos;
  }
  return NNACL_OK;
 }

 /*
  * Float division with one single-element operand, followed by Relu.
  *   param->in_elements_num0_ == 1 : quotient is in0[0] / in1[i]
  *   otherwise                      : quotient is in0[i] / in1[0]
  * A quotient is kept only when it compares greater than 0; otherwise 0 is stored.
  * Always returns NNACL_OK.
  */
 int ElementOptDivRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
  if (param->in_elements_num0_ != 1) {
    int pos = 0;
    while (pos < size) {
      const float quotient = in0[pos] / in1[0];
      out[pos] = quotient > 0 ? quotient : 0;
      ++pos;
    }
    return NNACL_OK;
  }
  int pos = 0;
  while (pos < size) {
    const float quotient = in0[0] / in1[pos];
    out[pos] = quotient > 0 ? quotient : 0;
    ++pos;
  }
  return NNACL_OK;
 }

 /*
  * Float division with one single-element operand, followed by Relu6.
  *   param->in_elements_num0_ == 1 : quotient is in0[0] / in1[i]
  *   otherwise                      : quotient is in0[i] / in1[0]
  * Each quotient goes through MSMIN(MSMAX(q, 0), 6). Always returns NNACL_OK.
  */
 int ElementOptDivRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
  if (param->in_elements_num0_ != 1) {
    int pos = 0;
    while (pos < size) {
      out[pos] = MSMIN(MSMAX(in0[pos] / in1[0], 0), 6);
      ++pos;
    }
    return NNACL_OK;
  }
  int pos = 0;
  while (pos < size) {
    out[pos] = MSMIN(MSMAX(in0[0] / in1[pos], 0), 6);
    ++pos;
  }
  return NNACL_OK;
 }

 /*
  * Integer division where one operand contributes only its first element.
  *   param->in_elements_num0_ == 1 : out[i] = in0[0] / in1[i]
  *   otherwise                      : out[i] = in0[i] / in1[0]
  * Returns NNACL_ERRCODE_DIVISOR_ZERO as soon as a zero divisor is met (in the first case the
  * elements of out before that position have already been written), NNACL_OK otherwise.
  */
 int ElementOptDivInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) {
  if (param->in_elements_num0_ == 1) {
    for (int index = 0; index < size; index++) {
      // Integer division by zero is undefined behaviour, so each divisor is checked before use,
      // matching the check the other branch performs on in1[0].
      if (in1[index] == 0) {
        return NNACL_ERRCODE_DIVISOR_ZERO;
      }
      out[index] = in0[0] / in1[index];
    }
  } else {
    if (in1[0] == 0) {
      return NNACL_ERRCODE_DIVISOR_ZERO;
    }
    for (int index = 0; index < size; index++) {
      out[index] = in0[index] / in1[0];
    }
  }
  return NNACL_OK;
 }

 /*
  * Passes both inputs through TileDimensionsFp32 (filling tile_in0 / tile_in1), then divides the
  * tiled buffers element-wise into out. Returns whatever ElementDiv returns.
  */
 int BroadcastDiv(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size,
                  ArithmeticParameter *param) {
  TileDimensionsFp32(in0, in1, tile_in0, tile_in1, param);
  const int status = ElementDiv(tile_in0, tile_in1, out, size);
  return status;
 }

 /* Element-wise float division: out[i] = in0[i] / in1[i] for i in [0, size). Always returns NNACL_OK. */
 int ElementDiv(const float *in0, const float *in1, float *out, int size) {
  int pos = 0;
  while (pos < size) {
    out[pos] = in0[pos] / in1[pos];
    ++pos;
  }
  return NNACL_OK;
 }

 /*
  * Element-wise float division followed by Relu: a quotient is stored only when it compares
  * greater than 0, otherwise 0 is stored. Always returns NNACL_OK.
  */
 int ElementDivRelu(const float *in0, const float *in1, float *out, int size) {
  int pos = 0;
  while (pos < size) {
    const float quotient = in0[pos] / in1[pos];
    if (quotient > 0) {
      out[pos] = quotient;
    } else {
      out[pos] = 0;
    }
    ++pos;
  }
  return NNACL_OK;
 }

 /*
  * Element-wise float division followed by Relu6: each quotient goes through
  * MSMIN(MSMAX(q, 0), 6) before being stored. Always returns NNACL_OK.
  */
 int ElementDivRelu6(const float *in0, const float *in1, float *out, int size) {
  int pos = 0;
  while (pos < size) {
    out[pos] = MSMIN(MSMAX(in0[pos] / in1[pos], 0), 6);
    ++pos;
  }
  return NNACL_OK;
 }
--- a/mindspore/lite/nnacl/fp32/div_fp32.h
+++ b/mindspore/lite/nnacl/fp32/div_fp32.h
@@ -0,0 +1,43 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifndef MINDSPORE_LITE_NNACL_FP32_DIV_H_
 #define MINDSPORE_LITE_NNACL_FP32_DIV_H_

 #ifdef ENABLE_NEON
 #include <arm_neon.h>
 #endif
 #include "nnacl/op_base.h"
 #include "nnacl/base/arithmetic_base.h"
 #include "nnacl/errorcode.h"

 #ifdef __cplusplus
 extern "C" {
 #endif
 /* out[i] = in0[i] / in1[i] for i in [0, size). */
 int ElementDiv(const float *in0, const float *in1, float *out, int size);
 /* Same quotient as ElementDiv; a result that does not compare greater than 0 is replaced by 0. */
 int ElementDivRelu(const float *in0, const float *in1, float *out, int size);
 /* Same quotient as ElementDiv, passed through MSMIN(MSMAX(q, 0), 6). */
 int ElementDivRelu6(const float *in0, const float *in1, float *out, int size);
 /*
  * "Opt" variants: one operand supplies only its first element. When param->in_elements_num0_ == 1
  * the quotient is in0[0] / in1[i]; otherwise it is in0[i] / in1[0]. In that second case
  * ElementOptDiv and ElementOptDivInt return NNACL_ERRCODE_DIVISOR_ZERO when in1[0] == 0.
  */
 int ElementOptDiv(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
 int ElementOptDivRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
 int ElementOptDivRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
 int ElementOptDivInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param);
 /* Runs TileDimensionsFp32 on the inputs, then ElementDiv on tile_in0 / tile_in1. */
 int BroadcastDiv(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size,
                  ArithmeticParameter *param);

 #ifdef __cplusplus
 }
 #endif

 #endif  // MINDSPORE_LITE_NNACL_FP32_DIV_H_
--- a/mindspore/lite/nnacl/fp32/lstm_fp32.c
+++ b/mindspore/lite/nnacl/fp32/lstm_fp32.c
@@ -19,6 +19,7 @@
 #include <float.h>
 #include "nnacl/fp32/activation_fp32.h"
 #include "nnacl/fp32/arithmetic_fp32.h"
 #include "nnacl/fp32/mul_fp32.h"

 void InitGate(float *gate_buffer, const float *bias, const LstmParameter *lstm_parm) {
  int gate_offest = 0;
--- a/mindspore/lite/nnacl/fp32/mul_fp32.c
+++ b/mindspore/lite/nnacl/fp32/mul_fp32.c
@@ -0,0 +1,327 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "nnacl/fp32/mul_fp32.h"
 #include "nnacl/fp32/arithmetic_fp32.h"

 /*
  * Passes both inputs through TileDimensionsFp32 (filling tile_in0 / tile_in1), then multiplies
  * the tiled buffers element-wise into out. Returns whatever ElementMul returns.
  */
 int BroadcastMul(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size,
                  ArithmeticParameter *param) {
  TileDimensionsFp32(in0, in1, tile_in0, tile_in1, param);
  const int status = ElementMul(tile_in0, tile_in1, out, size);
  return status;
 }

 /* Element-wise float product: out[i] = in0[i] * in1[i] for i in [0, size). Always returns NNACL_OK. */
 int ElementMul(const float *in0, const float *in1, float *out, int size) {
  int pos = 0;
 #ifdef ENABLE_NEON
  // Vector loop: runs while at least four elements remain and advances the index by C4NUM.
  while (pos <= size - 4) {
    const float32x4_t lhs = vld1q_f32(in0 + pos);
    const float32x4_t rhs = vld1q_f32(in1 + pos);
    vst1q_f32(out + pos, vmulq_f32(lhs, rhs));
    pos += C4NUM;
  }
 #endif
  // Scalar loop: the tail left by the vector loop, or every element when NEON is disabled.
  while (pos < size) {
    out[pos] = in0[pos] * in1[pos];
    ++pos;
  }
  return NNACL_OK;
 }

 /*
  * Element-wise float product followed by Relu: a product is stored only when it compares
  * greater than 0, otherwise 0 is stored. Always returns NNACL_OK.
  */
 int ElementMulRelu(const float *in0, const float *in1, float *out, int size) {
  int pos = 0;
 #ifdef ENABLE_NEON
  const float32x4_t zero_lanes = vdupq_n_f32(0.0f);
  // Vector loop: runs while at least four elements remain and advances the index by C4NUM.
  while (pos <= size - 4) {
    const float32x4_t lhs = vld1q_f32(in0 + pos);
    const float32x4_t rhs = vld1q_f32(in1 + pos);
    const float32x4_t product = vmulq_f32(lhs, rhs);
    // Lane mask is set where product > 0; the select keeps those lanes and zeroes the rest.
    const uint32x4_t is_positive = vcgtq_f32(product, zero_lanes);
    vst1q_f32(out + pos, vbslq_f32(is_positive, product, zero_lanes));
    pos += C4NUM;
  }
 #endif
  // Scalar loop: the tail left by the vector loop, or every element when NEON is disabled.
  while (pos < size) {
    const float product = in0[pos] * in1[pos];
    if (product > 0) {
      out[pos] = product;
    } else {
      out[pos] = 0;
    }
    ++pos;
  }
  return NNACL_OK;
 }

 /*
  * Element-wise float product followed by Relu6: each product in0[i] * in1[i] is limited to the
  * range [0, 6] before being stored in out[i]. Always returns NNACL_OK.
  */
 int ElementMulRelu6(const float *in0, const float *in1, float *out, int size) {
  int pos = 0;
 #ifdef ENABLE_NEON
  const float32x4_t lower = vdupq_n_f32(0.0f);
  const float32x4_t upper = vdupq_n_f32(6.0f);
  // Vector loop: runs while at least four elements remain and advances the index by C4NUM.
  while (pos <= size - 4) {
    const float32x4_t lhs = vld1q_f32(in0 + pos);
    const float32x4_t rhs = vld1q_f32(in1 + pos);
    const float32x4_t floored = vmaxq_f32(vmulq_f32(lhs, rhs), lower);
    vst1q_f32(out + pos, vminq_f32(floored, upper));
    pos += C4NUM;
  }
 #endif
  // Scalar loop: the tail left by the vector loop, or every element when NEON is disabled.
  while (pos < size) {
    out[pos] = MSMIN(MSMAX(in0[pos] * in1[pos], 0), 6);
    ++pos;
  }
  return NNACL_OK;
 }

 /* Element-wise int product: out[i] = in0[i] * in1[i] for i in [0, size). Always returns NNACL_OK. */
 int ElementMulInt(const int *in0, const int *in1, int *out, int size) {
  int pos = 0;
 #ifdef ENABLE_NEON
  // Vector loop: runs while at least four elements remain and advances the index by C4NUM.
  while (pos <= size - 4) {
    const int32x4_t lhs = vld1q_s32(in0 + pos);
    const int32x4_t rhs = vld1q_s32(in1 + pos);
    vst1q_s32(out + pos, vmulq_s32(lhs, rhs));
    pos += C4NUM;
  }
 #endif
  // Scalar loop: the tail left by the vector loop, or every element when NEON is disabled.
  while (pos < size) {
    out[pos] = in0[pos] * in1[pos];
    ++pos;
  }
  return NNACL_OK;
 }

 /*
  * Element-wise int product followed by Relu: a product is stored only when it is greater than 0,
  * otherwise 0 is stored. Always returns NNACL_OK.
  */
 int ElementMulReluInt(const int *in0, const int *in1, int *out, int size) {
  int index = 0;
 #ifdef ENABLE_NEON
  int32x4_t zeros = vdupq_n_s32(0);
  // Vector loop: runs while at least four elements remain and advances the index by C4NUM.
  for (; index <= size - 4; index += C4NUM) {
    int32x4_t vin0 = vld1q_s32(in0 + index);
    int32x4_t vin1 = vld1q_s32(in1 + index);
    int32x4_t vout = vmulq_s32(vin0, vin1);
    // Lane mask is set where the product is > 0; the select keeps those lanes and zeroes the rest.
    vout = vbslq_s32(vcgtq_s32(vout, zeros), vout, zeros);
    vst1q_s32(out + index, vout);
  }
 #endif
  // Scalar loop: the tail left by the vector loop, or every element when NEON is disabled.
  for (; index < size; index++) {
    // The product stays an int: routing it through float rounds values that need more than 24
    // significant bits and makes this path disagree with the NEON path above.
    int res = in0[index] * in1[index];
    out[index] = res > 0 ? res : 0;
  }
  return NNACL_OK;
 }

 /*
  * Element-wise int product followed by Relu6: each product in0[i] * in1[i] is limited to the
  * range [0, 6] before being stored in out[i]. Always returns NNACL_OK.
  */
 int ElementMulRelu6Int(const int *in0, const int *in1, int *out, int size) {
  int pos = 0;
 #ifdef ENABLE_NEON
  const int32x4_t lower = vdupq_n_s32(0);
  const int32x4_t upper = vdupq_n_s32(6);
  // Vector loop: runs while at least four elements remain and advances the index by C4NUM.
  while (pos <= size - 4) {
    const int32x4_t lhs = vld1q_s32(in0 + pos);
    const int32x4_t rhs = vld1q_s32(in1 + pos);
    const int32x4_t floored = vmaxq_s32(vmulq_s32(lhs, rhs), lower);
    vst1q_s32(out + pos, vminq_s32(floored, upper));
    pos += C4NUM;
  }
 #endif
  // Scalar loop: the tail left by the vector loop, or every element when NEON is disabled.
  while (pos < size) {
    out[pos] = MSMIN(MSMAX(in0[pos] * in1[pos], 0), 6);
    ++pos;
  }
  return NNACL_OK;
 }

 /*
  * Float product where one operand contributes only its first element.
  *   param->in_elements_num0_ == 1 : out[i] = in0[0] * in1[i]
  *   otherwise                      : out[i] = in0[i] * in1[0]
  * Always returns NNACL_OK.
  */
 int ElementOptMul(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
 #ifdef ENABLE_NEON
  // Both first elements are replicated across a vector up front, whichever branch is taken.
  const float32x4_t first0 = vdupq_n_f32(in0[0]);
  const float32x4_t first1 = vdupq_n_f32(in1[0]);
 #endif
  int pos = 0;
  if (param->in_elements_num0_ != 1) {
    // in1[0] is applied to every element of in0.
 #ifdef ENABLE_NEON
    while (pos <= size - 4) {
      const float32x4_t lhs = vld1q_f32(in0 + pos);
      vst1q_f32(out + pos, vmulq_f32(lhs, first1));
      pos += C4NUM;
    }
 #endif
    while (pos < size) {
      out[pos] = in0[pos] * in1[0];
      ++pos;
    }
    return NNACL_OK;
  }
  // in0[0] is applied to every element of in1.
 #ifdef ENABLE_NEON
  while (pos <= size - 4) {
    const float32x4_t rhs = vld1q_f32(in1 + pos);
    vst1q_f32(out + pos, vmulq_f32(first0, rhs));
    pos += C4NUM;
  }
 #endif
  while (pos < size) {
    out[pos] = in0[0] * in1[pos];
    ++pos;
  }
  return NNACL_OK;
 }

 /*
  * Float product with one single-element operand, followed by Relu (maximum with 0).
  *   param->in_elements_num0_ == 1 : product is in0[0] * in1[i]
  *   otherwise                      : product is in0[i] * in1[0]
  * Always returns NNACL_OK.
  */
 int ElementOptMulRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
 #ifdef ENABLE_NEON
  // Both first elements are replicated across a vector up front, whichever branch is taken.
  const float32x4_t first0 = vdupq_n_f32(in0[0]);
  const float32x4_t first1 = vdupq_n_f32(in1[0]);
  const float32x4_t lower = vdupq_n_f32(0.0f);
 #endif
  int pos = 0;
  if (param->in_elements_num0_ != 1) {
    // in1[0] is applied to every element of in0.
 #ifdef ENABLE_NEON
    while (pos <= size - 4) {
      const float32x4_t product = vmulq_f32(vld1q_f32(in0 + pos), first1);
      vst1q_f32(out + pos, vmaxq_f32(product, lower));
      pos += C4NUM;
    }
 #endif
    while (pos < size) {
      out[pos] = MSMAX(in0[pos] * in1[0], 0);
      ++pos;
    }
    return NNACL_OK;
  }
  // in0[0] is applied to every element of in1.
 #ifdef ENABLE_NEON
  while (pos <= size - 4) {
    const float32x4_t product = vmulq_f32(first0, vld1q_f32(in1 + pos));
    vst1q_f32(out + pos, vmaxq_f32(product, lower));
    pos += C4NUM;
  }
 #endif
  while (pos < size) {
    out[pos] = MSMAX(in0[0] * in1[pos], 0);
    ++pos;
  }
  return NNACL_OK;
 }

 /*
  * Float product with one single-element operand, followed by Relu6 (limited to [0, 6]).
  *   param->in_elements_num0_ == 1 : product is in0[0] * in1[i]
  *   otherwise                      : product is in0[i] * in1[0]
  * Always returns NNACL_OK.
  */
 int ElementOptMulRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
 #ifdef ENABLE_NEON
  // Both first elements are replicated across a vector up front, whichever branch is taken.
  const float32x4_t first0 = vdupq_n_f32(in0[0]);
  const float32x4_t first1 = vdupq_n_f32(in1[0]);
  const float32x4_t lower = vdupq_n_f32(0.0f);
  const float32x4_t upper = vdupq_n_f32(6.0f);
 #endif
  int pos = 0;
  if (param->in_elements_num0_ != 1) {
    // in1[0] is applied to every element of in0.
 #ifdef ENABLE_NEON
    while (pos <= size - 4) {
      const float32x4_t product = vmulq_f32(vld1q_f32(in0 + pos), first1);
      vst1q_f32(out + pos, vminq_f32(vmaxq_f32(product, lower), upper));
      pos += C4NUM;
    }
 #endif
    while (pos < size) {
      out[pos] = MSMIN(MSMAX(in0[pos] * in1[0], 0), 6);
      ++pos;
    }
    return NNACL_OK;
  }
  // in0[0] is applied to every element of in1.
 #ifdef ENABLE_NEON
  while (pos <= size - 4) {
    const float32x4_t product = vmulq_f32(first0, vld1q_f32(in1 + pos));
    vst1q_f32(out + pos, vminq_f32(vmaxq_f32(product, lower), upper));
    pos += C4NUM;
  }
 #endif
  while (pos < size) {
    out[pos] = MSMIN(MSMAX(in0[0] * in1[pos], 0), 6);
    ++pos;
  }
  return NNACL_OK;
 }

 /*
  * Int product where one operand contributes only its first element.
  *   param->in_elements_num0_ == 1 : out[i] = in0[0] * in1[i]
  *   otherwise                      : out[i] = in0[i] * in1[0]
  * Always returns NNACL_OK.
  */
 int ElementOptMulInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) {
 #ifdef ENABLE_NEON
  // Both first elements are replicated across a vector up front, whichever branch is taken.
  const int32x4_t first0 = vdupq_n_s32(in0[0]);
  const int32x4_t first1 = vdupq_n_s32(in1[0]);
 #endif
  int pos = 0;
  if (param->in_elements_num0_ != 1) {
    // in1[0] is applied to every element of in0.
 #ifdef ENABLE_NEON
    while (pos <= size - 4) {
      const int32x4_t lhs = vld1q_s32(in0 + pos);
      vst1q_s32(out + pos, vmulq_s32(lhs, first1));
      pos += C4NUM;
    }
 #endif
    while (pos < size) {
      out[pos] = in0[pos] * in1[0];
      ++pos;
    }
    return NNACL_OK;
  }
  // in0[0] is applied to every element of in1.
 #ifdef ENABLE_NEON
  while (pos <= size - 4) {
    const int32x4_t rhs = vld1q_s32(in1 + pos);
    vst1q_s32(out + pos, vmulq_s32(first0, rhs));
    pos += C4NUM;
  }
 #endif
  while (pos < size) {
    out[pos] = in0[0] * in1[pos];
    ++pos;
  }
  return NNACL_OK;
 }

 /*
  * Int product with one single-element operand, followed by Relu (maximum with 0).
  *   param->in_elements_num0_ == 1 : product is in0[0] * in1[i]
  *   otherwise                      : product is in0[i] * in1[0]
  * Always returns NNACL_OK.
  */
 int ElementOptMulReluInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) {
 #ifdef ENABLE_NEON
  // Both first elements are replicated across a vector up front, whichever branch is taken.
  const int32x4_t first0 = vdupq_n_s32(in0[0]);
  const int32x4_t first1 = vdupq_n_s32(in1[0]);
  const int32x4_t lower = vdupq_n_s32(0);
 #endif
  int pos = 0;
  if (param->in_elements_num0_ != 1) {
    // in1[0] is applied to every element of in0.
 #ifdef ENABLE_NEON
    while (pos <= size - 4) {
      const int32x4_t product = vmulq_s32(vld1q_s32(in0 + pos), first1);
      vst1q_s32(out + pos, vmaxq_s32(product, lower));
      pos += C4NUM;
    }
 #endif
    while (pos < size) {
      out[pos] = MSMAX(in0[pos] * in1[0], 0);
      ++pos;
    }
    return NNACL_OK;
  }
  // in0[0] is applied to every element of in1.
 #ifdef ENABLE_NEON
  while (pos <= size - 4) {
    const int32x4_t product = vmulq_s32(first0, vld1q_s32(in1 + pos));
    vst1q_s32(out + pos, vmaxq_s32(product, lower));
    pos += C4NUM;
  }
 #endif
  while (pos < size) {
    out[pos] = MSMAX(in0[0] * in1[pos], 0);
    ++pos;
  }
  return NNACL_OK;
 }

 /*
  * Int product with one single-element operand, followed by Relu6 (limited to [0, 6]).
  *   param->in_elements_num0_ == 1 : product is in0[0] * in1[i]
  *   otherwise                      : product is in0[i] * in1[0]
  * Always returns NNACL_OK.
  */
 int ElementOptMulRelu6Int(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) {
 #ifdef ENABLE_NEON
  // Both first elements are replicated across a vector up front, whichever branch is taken.
  const int32x4_t first0 = vdupq_n_s32(in0[0]);
  const int32x4_t first1 = vdupq_n_s32(in1[0]);
  const int32x4_t lower = vdupq_n_s32(0);
  const int32x4_t upper = vdupq_n_s32(6);
 #endif
  int pos = 0;
  if (param->in_elements_num0_ != 1) {
    // in1[0] is applied to every element of in0.
 #ifdef ENABLE_NEON
    while (pos <= size - 4) {
      const int32x4_t product = vmulq_s32(vld1q_s32(in0 + pos), first1);
      vst1q_s32(out + pos, vminq_s32(vmaxq_s32(product, lower), upper));
      pos += C4NUM;
    }
 #endif
    while (pos < size) {
      out[pos] = MSMIN(MSMAX(in0[pos] * in1[0], 0), 6);
      ++pos;
    }
    return NNACL_OK;
  }
  // in0[0] is applied to every element of in1.
 #ifdef ENABLE_NEON
  while (pos <= size - 4) {
    const int32x4_t product = vmulq_s32(first0, vld1q_s32(in1 + pos));
    vst1q_s32(out + pos, vminq_s32(vmaxq_s32(product, lower), upper));
    pos += C4NUM;
  }
 #endif
  while (pos < size) {
    out[pos] = MSMIN(MSMAX(in0[0] * in1[pos], 0), 6);
    ++pos;
  }
  return NNACL_OK;
 }
--- a/mindspore/lite/nnacl/fp32/mul_fp32.h
+++ b/mindspore/lite/nnacl/fp32/mul_fp32.h
@@ -0,0 +1,49 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifndef MINDSPORE_LITE_NNACL_FP32_MUL_H_
 #define MINDSPORE_LITE_NNACL_FP32_MUL_H_

 #ifdef ENABLE_NEON
 #include <arm_neon.h>
 #endif
 #include "nnacl/op_base.h"
 #include "nnacl/base/arithmetic_base.h"
 #include "nnacl/errorcode.h"

 #ifdef __cplusplus
 extern "C" {
 #endif

 /* out[i] = in0[i] * in1[i] for i in [0, size); float buffers. */
 int ElementMul(const float *in0, const float *in1, float *out, int size);
 /* Same product as ElementMul; a result that does not compare greater than 0 is replaced by 0. */
 int ElementMulRelu(const float *in0, const float *in1, float *out, int size);
 /* Same product as ElementMul, limited to the range [0, 6]. */
 int ElementMulRelu6(const float *in0, const float *in1, float *out, int size);
 /* Int counterparts of the three functions above. */
 int ElementMulInt(const int *in0, const int *in1, int *out, int size);
 int ElementMulReluInt(const int *in0, const int *in1, int *out, int size);
 int ElementMulRelu6Int(const int *in0, const int *in1, int *out, int size);
 /*
  * "Opt" variants: one operand supplies only its first element. When param->in_elements_num0_ == 1
  * the product is in0[0] * in1[i]; otherwise it is in0[i] * in1[0].
  */
 int ElementOptMul(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
 int ElementOptMulRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
 int ElementOptMulRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
 int ElementOptMulInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param);
 int ElementOptMulReluInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param);
 int ElementOptMulRelu6Int(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param);
 /* Runs TileDimensionsFp32 on the inputs, then ElementMul on tile_in0 / tile_in1. */
 int BroadcastMul(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size,
                  ArithmeticParameter *param);

 #ifdef __cplusplus
 }
 #endif

 #endif  // MINDSPORE_LITE_NNACL_FP32_MUL_H_
--- a/mindspore/lite/nnacl/fp32/squared_difference.c
+++ b/mindspore/lite/nnacl/fp32/squared_difference.c
@@ -0,0 +1,28 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "nnacl/fp32/squared_difference.h"
 #include "nnacl/fp32/sub_fp32.h"
 #include "nnacl/fp32/mul_fp32.h"

 /*
  * Squared difference in two passes over out: first out = in0 - in1 (ElementSub), then
  * out = out * out in place (ElementMul). Returns the status of the multiply pass; the status of
  * the subtraction pass is not inspected.
  *
  * This source file deliberately carries no include guard: defining the header's guard macro
  * here before including squared_difference.h would blank out that header, so the prototype
  * would never be checked against this definition.
  */
 int ElementSquaredDifference(const float *in0, const float *in1, float *out, int size) {
  ElementSub(in0, in1, out, size);
  return ElementMul(out, out, out, size);
 }
--- a/mindspore/lite/nnacl/fp32/squared_difference.h
+++ b/mindspore/lite/nnacl/fp32/squared_difference.h
@@ -0,0 +1,37 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifndef MINDSPORE_LITE_NNACL_SQUARED_DIFFERENCE_H_
 #define MINDSPORE_LITE_NNACL_SQUARED_DIFFERENCE_H_

 #ifdef ENABLE_NEON
 #include <arm_neon.h>
 #endif
 #include "nnacl/op_base.h"
 #include "nnacl/base/arithmetic_base.h"
 #include "nnacl/errorcode.h"

 #ifdef __cplusplus
 extern "C" {
 #endif

 /*
  * Element Squared Difference: out[i] = (in0[i] - in1[i]) squared for i in [0, size), computed as
  * ElementSub into out followed by an in-place ElementMul of out with itself.
  */
 int ElementSquaredDifference(const float *in0, const float *in1, float *out, int size);

 #ifdef __cplusplus
 }
 #endif

 #endif  // MINDSPORE_LITE_NNACL_SQUARED_DIFFERENCE_H_
--- a/mindspore/lite/nnacl/fp32/sub_fp32.c
+++ b/mindspore/lite/nnacl/fp32/sub_fp32.c
@@ -0,0 +1,217 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "nnacl/fp32/sub_fp32.h"

 /*
  * Float subtraction where one operand contributes only its first element.
  *   param->in_elements_num0_ == 1 : out[i] = in0[0] - in1[i]
  *   otherwise                      : out[i] = in0[i] - in1[0]
  * Always returns NNACL_OK.
  */
 int ElementOptSub(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
 #ifdef ENABLE_NEON
  // Both first elements are replicated across a vector up front, whichever branch is taken.
  const float32x4_t first0 = vdupq_n_f32(in0[0]);
  const float32x4_t first1 = vdupq_n_f32(in1[0]);
 #endif
  int pos = 0;
  if (param->in_elements_num0_ != 1) {
    // in1[0] is subtracted from every element of in0.
 #ifdef ENABLE_NEON
    while (pos <= size - 4) {
      const float32x4_t lhs = vld1q_f32(in0 + pos);
      vst1q_f32(out + pos, vsubq_f32(lhs, first1));
      pos += C4NUM;
    }
 #endif
    while (pos < size) {
      out[pos] = in0[pos] - in1[0];
      ++pos;
    }
    return NNACL_OK;
  }
  // Every element of in1 is subtracted from in0[0].
 #ifdef ENABLE_NEON
  while (pos <= size - 4) {
    const float32x4_t rhs = vld1q_f32(in1 + pos);
    vst1q_f32(out + pos, vsubq_f32(first0, rhs));
    pos += C4NUM;
  }
 #endif
  while (pos < size) {
    out[pos] = in0[0] - in1[pos];
    ++pos;
  }
  return NNACL_OK;
 }

 /*
  * Int subtraction where one operand contributes only its first element.
  *   param->in_elements_num0_ == 1 : out[i] = in0[0] - in1[i]
  *   otherwise                      : out[i] = in0[i] - in1[0]
  * Always returns NNACL_OK.
  */
 int ElementOptSubInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) {
 #ifdef ENABLE_NEON
  // Both first elements are replicated across a vector up front, whichever branch is taken.
  const int32x4_t first0 = vdupq_n_s32(in0[0]);
  const int32x4_t first1 = vdupq_n_s32(in1[0]);
 #endif
  int pos = 0;
  if (param->in_elements_num0_ != 1) {
    // in1[0] is subtracted from every element of in0.
 #ifdef ENABLE_NEON
    while (pos <= size - 4) {
      const int32x4_t lhs = vld1q_s32(in0 + pos);
      vst1q_s32(out + pos, vsubq_s32(lhs, first1));
      pos += C4NUM;
    }
 #endif
    while (pos < size) {
      out[pos] = in0[pos] - in1[0];
      ++pos;
    }
    return NNACL_OK;
  }
  // Every element of in1 is subtracted from in0[0].
 #ifdef ENABLE_NEON
  while (pos <= size - 4) {
    const int32x4_t rhs = vld1q_s32(in1 + pos);
    vst1q_s32(out + pos, vsubq_s32(first0, rhs));
    pos += C4NUM;
  }
 #endif
  while (pos < size) {
    out[pos] = in0[0] - in1[pos];
    ++pos;
  }
  return NNACL_OK;
 }

 /*
  * Float subtraction with one single-element operand, followed by Relu (maximum with 0).
  *   param->in_elements_num0_ == 1 : difference is in0[0] - in1[i]
  *   otherwise                      : difference is in0[i] - in1[0]
  * Always returns NNACL_OK.
  */
 int ElementOptSubRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
 #ifdef ENABLE_NEON
  // Both first elements are replicated across a vector up front, whichever branch is taken.
  const float32x4_t first0 = vdupq_n_f32(in0[0]);
  const float32x4_t first1 = vdupq_n_f32(in1[0]);
  const float32x4_t lower = vdupq_n_f32(0.0f);
 #endif
  int pos = 0;
  if (param->in_elements_num0_ != 1) {
    // in1[0] is subtracted from every element of in0.
 #ifdef ENABLE_NEON
    while (pos <= size - 4) {
      const float32x4_t diff = vsubq_f32(vld1q_f32(in0 + pos), first1);
      vst1q_f32(out + pos, vmaxq_f32(diff, lower));
      pos += C4NUM;
    }
 #endif
    while (pos < size) {
      out[pos] = MSMAX(in0[pos] - in1[0], 0);
      ++pos;
    }
    return NNACL_OK;
  }
  // Every element of in1 is subtracted from in0[0].
 #ifdef ENABLE_NEON
  while (pos <= size - 4) {
    const float32x4_t diff = vsubq_f32(first0, vld1q_f32(in1 + pos));
    vst1q_f32(out + pos, vmaxq_f32(diff, lower));
    pos += C4NUM;
  }
 #endif
  while (pos < size) {
    out[pos] = MSMAX(in0[0] - in1[pos], 0);
    ++pos;
  }
  return NNACL_OK;
 }

 /*
  * Float subtraction with one single-element operand, followed by Relu6 (limited to [0, 6]).
  *   param->in_elements_num0_ == 1 : difference is in0[0] - in1[i]
  *   otherwise                      : difference is in0[i] - in1[0]
  * Always returns NNACL_OK.
  */
 int ElementOptSubRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
 #ifdef ENABLE_NEON
  // Both first elements are replicated across a vector up front, whichever branch is taken.
  const float32x4_t first0 = vdupq_n_f32(in0[0]);
  const float32x4_t first1 = vdupq_n_f32(in1[0]);
  const float32x4_t lower = vdupq_n_f32(0.0f);
  const float32x4_t upper = vdupq_n_f32(6.0f);
 #endif
  int pos = 0;
  if (param->in_elements_num0_ != 1) {
    // in1[0] is subtracted from every element of in0.
 #ifdef ENABLE_NEON
    while (pos <= size - 4) {
      const float32x4_t diff = vsubq_f32(vld1q_f32(in0 + pos), first1);
      vst1q_f32(out + pos, vminq_f32(vmaxq_f32(diff, lower), upper));
      pos += C4NUM;
    }
 #endif
    while (pos < size) {
      out[pos] = MSMIN(MSMAX(in0[pos] - in1[0], 0), 6);
      ++pos;
    }
    return NNACL_OK;
  }
  // Every element of in1 is subtracted from in0[0].
 #ifdef ENABLE_NEON
  while (pos <= size - 4) {
    const float32x4_t diff = vsubq_f32(first0, vld1q_f32(in1 + pos));
    vst1q_f32(out + pos, vminq_f32(vmaxq_f32(diff, lower), upper));
    pos += C4NUM;
  }
 #endif
  while (pos < size) {
    out[pos] = MSMIN(MSMAX(in0[0] - in1[pos], 0), 6);
    ++pos;
  }
  return NNACL_OK;
 }

 /* Element-wise float subtraction: out[i] = in0[i] - in1[i] for i in [0, size). Always returns NNACL_OK. */
 int ElementSub(const float *in0, const float *in1, float *out, int size) {
  int pos = 0;
 #ifdef ENABLE_NEON
  // Vector loop: runs while at least four elements remain and advances the index by C4NUM.
  while (pos <= size - 4) {
    const float32x4_t lhs = vld1q_f32(in0 + pos);
    const float32x4_t rhs = vld1q_f32(in1 + pos);
    vst1q_f32(out + pos, vsubq_f32(lhs, rhs));
    pos += C4NUM;
  }
 #endif
  // Scalar loop: the tail left by the vector loop, or every element when NEON is disabled.
  while (pos < size) {
    out[pos] = in0[pos] - in1[pos];
    ++pos;
  }
  return NNACL_OK;
 }

 /* Element-wise int subtraction: out[i] = in0[i] - in1[i] for i in [0, size). Always returns NNACL_OK. */
 int ElementSubInt(const int *in0, const int *in1, int *out, int size) {
  int pos = 0;
 #ifdef ENABLE_NEON
  // Vector loop: runs while at least four elements remain and advances the index by C4NUM.
  while (pos <= size - 4) {
    const int32x4_t lhs = vld1q_s32(in0 + pos);
    const int32x4_t rhs = vld1q_s32(in1 + pos);
    vst1q_s32(out + pos, vsubq_s32(lhs, rhs));
    pos += C4NUM;
  }
 #endif
  // Scalar loop: the tail left by the vector loop, or every element when NEON is disabled.
  while (pos < size) {
    out[pos] = in0[pos] - in1[pos];
    ++pos;
  }
  return NNACL_OK;
 }

 /*
  * Element-wise float subtraction followed by Relu: a difference is stored only when it compares
  * greater than 0, otherwise 0 is stored. Always returns NNACL_OK.
  */
 int ElementSubRelu(const float *in0, const float *in1, float *out, int size) {
  int pos = 0;
 #ifdef ENABLE_NEON
  const float32x4_t zero_lanes = vdupq_n_f32(0.0f);
  // Vector loop: runs while at least four elements remain and advances the index by C4NUM.
  while (pos <= size - 4) {
    const float32x4_t lhs = vld1q_f32(in0 + pos);
    const float32x4_t rhs = vld1q_f32(in1 + pos);
    const float32x4_t diff = vsubq_f32(lhs, rhs);
    // Lane mask is set where diff > 0; the select keeps those lanes and zeroes the rest.
    const uint32x4_t is_positive = vcgtq_f32(diff, zero_lanes);
    vst1q_f32(out + pos, vbslq_f32(is_positive, diff, zero_lanes));
    pos += C4NUM;
  }
 #endif
  // Scalar loop: the tail left by the vector loop, or every element when NEON is disabled.
  while (pos < size) {
    const float diff = in0[pos] - in1[pos];
    if (diff > 0) {
      out[pos] = diff;
    } else {
      out[pos] = 0;
    }
    ++pos;
  }
  return NNACL_OK;
 }

 /*
  * Element-wise float subtraction followed by Relu6: each difference in0[i] - in1[i] is limited
  * to the range [0, 6] before being stored in out[i]. Always returns NNACL_OK.
  */
 int ElementSubRelu6(const float *in0, const float *in1, float *out, int size) {
  int pos = 0;
 #ifdef ENABLE_NEON
  const float32x4_t lower = vdupq_n_f32(0.0f);
  const float32x4_t upper = vdupq_n_f32(6.0f);
  // Vector loop: runs while at least four elements remain and advances the index by C4NUM.
  while (pos <= size - 4) {
    const float32x4_t lhs = vld1q_f32(in0 + pos);
    const float32x4_t rhs = vld1q_f32(in1 + pos);
    const float32x4_t diff = vsubq_f32(lhs, rhs);
    const float32x4_t floored = vmaxq_f32(diff, lower);
    vst1q_f32(out + pos, vminq_f32(floored, upper));
    pos += C4NUM;
  }
 #endif
  // Scalar loop: the tail left by the vector loop, or every element when NEON is disabled.
  while (pos < size) {
    out[pos] = MSMIN(MSMAX(in0[pos] - in1[pos], 0), 6);
    pos++;
  }
  return NNACL_OK;
 }
--- a/mindspore/lite/nnacl/fp32/sub_fp32.h
+++ b/mindspore/lite/nnacl/fp32/sub_fp32.h
@@ -0,0 +1,43 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifndef MINDSPORE_LITE_NNACL_SUB_FP32_H_
 #define MINDSPORE_LITE_NNACL_SUB_FP32_H_

 #ifdef ENABLE_NEON
 #include <arm_neon.h>
 #endif
 #include "nnacl/op_base.h"
 #include "nnacl/base/arithmetic_base.h"
 #include "nnacl/errorcode.h"

 #ifdef __cplusplus
 extern "C" {
 #endif

 /* out[i] = in0[i] - in1[i] for i in [0, size); float buffers. */
 int ElementSub(const float *in0, const float *in1, float *out, int size);
 /* Int counterpart of ElementSub. */
 int ElementSubInt(const int *in0, const int *in1, int *out, int size);
 /* Same difference as ElementSub; a result that does not compare greater than 0 is replaced by 0. */
 int ElementSubRelu(const float *in0, const float *in1, float *out, int size);
 /* Same difference as ElementSub, limited to the range [0, 6]. */
 int ElementSubRelu6(const float *in0, const float *in1, float *out, int size);
 /*
  * "Opt" variants: one operand supplies only its first element. When param->in_elements_num0_ == 1
  * the difference is in0[0] - in1[i]; otherwise it is in0[i] - in1[0].
  */
 int ElementOptSub(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
 int ElementOptSubRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
 int ElementOptSubRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
 int ElementOptSubInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param);

 #ifdef __cplusplus
 }
 #endif

 #endif  // MINDSPORE_LITE_NNACL_SUB_FP32_H_