Browse Source

move mul/add/div/sub from arithmetic_fp32

tags/v1.2.0-rc1
smurf chengyuanwang 4 years ago
parent
commit
fe90be3bb4
13 changed files with 1127 additions and 864 deletions
  1. +225
    -0
      mindspore/lite/nnacl/fp32/add_fp32.c
  2. +45
    -0
      mindspore/lite/nnacl/fp32/add_fp32.h
  3. +0
    -811
      mindspore/lite/nnacl/fp32/arithmetic_fp32.c
  4. +5
    -53
      mindspore/lite/nnacl/fp32/arithmetic_fp32.h
  5. +107
    -0
      mindspore/lite/nnacl/fp32/div_fp32.c
  6. +43
    -0
      mindspore/lite/nnacl/fp32/div_fp32.h
  7. +1
    -0
      mindspore/lite/nnacl/fp32/lstm_fp32.c
  8. +327
    -0
      mindspore/lite/nnacl/fp32/mul_fp32.c
  9. +49
    -0
      mindspore/lite/nnacl/fp32/mul_fp32.h
  10. +28
    -0
      mindspore/lite/nnacl/fp32/squared_difference.c
  11. +37
    -0
      mindspore/lite/nnacl/fp32/squared_difference.h
  12. +217
    -0
      mindspore/lite/nnacl/fp32/sub_fp32.c
  13. +43
    -0
      mindspore/lite/nnacl/fp32/sub_fp32.h

+ 225
- 0
mindspore/lite/nnacl/fp32/add_fp32.c View File

@@ -0,0 +1,225 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "nnacl/fp32/add_fp32.h"
#include "nnacl/fp32/arithmetic_fp32.h"

int ElementOptAdd(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
#ifdef ENABLE_NEON
float32x4_t vin0_opt = vdupq_n_f32(in0[0]);
float32x4_t vin1_opt = vdupq_n_f32(in1[0]);
#endif
int index = 0;
if (param->in_elements_num0_ == 1) {
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
float32x4_t vin1 = vld1q_f32(in1 + index);
float32x4_t vout = vaddq_f32(vin0_opt, vin1);
vst1q_f32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = in0[0] + in1[index];
}
} else {
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
float32x4_t vin0 = vld1q_f32(in0 + index);
float32x4_t vout = vaddq_f32(vin0, vin1_opt);
vst1q_f32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = in0[index] + in1[0];
}
}
return NNACL_OK;
}

int ElementOptAddInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) {
#ifdef ENABLE_NEON
int32x4_t vin0_opt = vdupq_n_s32(in0[0]);
int32x4_t vin1_opt = vdupq_n_s32(in1[0]);
#endif
int index = 0;
if (param->in_elements_num0_ == 1) {
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
int32x4_t vin1 = vld1q_s32(in1 + index);
int32x4_t vout = vaddq_s32(vin0_opt, vin1);
vst1q_s32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = in0[0] + in1[index];
}
} else {
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
int32x4_t vin0 = vld1q_s32(in0 + index);
int32x4_t vout = vaddq_s32(vin0, vin1_opt);
vst1q_s32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = in0[index] + in1[0];
}
}
return NNACL_OK;
}

int ElementOptAddRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
#ifdef ENABLE_NEON
float32x4_t vin0_opt = vdupq_n_f32(in0[0]);
float32x4_t vin1_opt = vdupq_n_f32(in1[0]);
float32x4_t zeros = vdupq_n_f32(0.0f);
#endif
int index = 0;
if (param->in_elements_num0_ == 1) {
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
float32x4_t vin1 = vld1q_f32(in1 + index);
float32x4_t vout = vmaxq_f32(vaddq_f32(vin0_opt, vin1), zeros);
vst1q_f32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = MSMAX(in0[0] + in1[index], 0);
}
} else {
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
float32x4_t vin0 = vld1q_f32(in0 + index);
float32x4_t vout = vmaxq_f32(vaddq_f32(vin0, vin1_opt), zeros);
vst1q_f32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = MSMAX(in0[index] + in1[0], 0);
}
}
return NNACL_OK;
}

int ElementOptAddRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
#ifdef ENABLE_NEON
float32x4_t vin0_opt = vdupq_n_f32(in0[0]);
float32x4_t vin1_opt = vdupq_n_f32(in1[0]);
float32x4_t zeros = vdupq_n_f32(0.0f);
float32x4_t bounds = vdupq_n_f32(6.0f);
#endif
int index = 0;
if (param->in_elements_num0_ == 1) {
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
float32x4_t vin1 = vld1q_f32(in1 + index);
float32x4_t vout = vminq_f32(vmaxq_f32(vaddq_f32(vin0_opt, vin1), zeros), bounds);
vst1q_f32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = MSMIN(MSMAX(in0[0] + in1[index], 0), 6);
}
} else {
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
float32x4_t vin0 = vld1q_f32(in0 + index);
float32x4_t vout = vminq_f32(vmaxq_f32(vaddq_f32(vin0, vin1_opt), zeros), bounds);
vst1q_f32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = MSMIN(MSMAX(in0[index] + in1[0], 0), 6);
}
}

return NNACL_OK;
}

int BroadcastAdd(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size,
ArithmeticParameter *param) {
TileDimensionsFp32(in0, in1, tile_in0, tile_in1, param);
return ElementAdd(tile_in0, tile_in1, out, size);
}

int ElementAdd(const float *in0, const float *in1, float *out, int size) {
int index = 0;
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
float32x4_t vin0 = vld1q_f32(in0 + index);
float32x4_t vin1 = vld1q_f32(in1 + index);
float32x4_t vout = vaddq_f32(vin0, vin1);
vst1q_f32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = in0[index] + in1[index];
}
return NNACL_OK;
}

int ElementAddRelu(const float *in0, const float *in1, float *out, int size) {
int index = 0;
#ifdef ENABLE_NEON
float32x4_t zeros = vdupq_n_f32(0.0f);
for (; index <= size - 4; index += C4NUM) {
float32x4_t vin0 = vld1q_f32(in0 + index);
float32x4_t vin1 = vld1q_f32(in1 + index);
float32x4_t vout = vaddq_f32(vin0, vin1);
vout = vbslq_f32(vcgtq_f32(vout, zeros), vout, zeros);
vst1q_f32(out + index, vout);
}
#endif
for (; index < size; index++) {
float res = in0[index] + in1[index];
out[index] = res > 0 ? res : 0;
}
return NNACL_OK;
}

int ElementAddRelu6(const float *in0, const float *in1, float *out, int size) {
int index = 0;
#ifdef ENABLE_NEON
float32x4_t zeros = vdupq_n_f32(0.0f);
float32x4_t bounds = vdupq_n_f32(6.0f);
for (; index <= size - 4; index += C4NUM) {
float32x4_t vin0 = vld1q_f32(in0 + index);
float32x4_t vin1 = vld1q_f32(in1 + index);
float32x4_t vout = vminq_f32(vmaxq_f32(vaddq_f32(vin0, vin1), zeros), bounds);
vst1q_f32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = MSMIN(MSMAX(in0[index] + in1[index], 0), 6);
}
return NNACL_OK;
}

int ElementAddInt(const int *in0, const int *in1, int *out, int size) {
int index = 0;
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
int32x4_t vin0 = vld1q_s32(in0 + index);
int32x4_t vin1 = vld1q_s32(in1 + index);
int32x4_t vout = vaddq_s32(vin0, vin1);
vst1q_s32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = in0[index] + in1[index];
}
return NNACL_OK;
}

+ 45
- 0
mindspore/lite/nnacl/fp32/add_fp32.h View File

@@ -0,0 +1,45 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_NNACL_FP32_ADD_H_
#define MINDSPORE_LITE_NNACL_FP32_ADD_H_

#ifdef ENABLE_NEON
#include <arm_neon.h>
#endif
#include "nnacl/op_base.h"
#include "nnacl/base/arithmetic_base.h"
#include "nnacl/errorcode.h"

#ifdef __cplusplus
extern "C" {
#endif

int ElementAdd(const float *in0, const float *in1, float *out, int size);
int ElementAddRelu(const float *in0, const float *in1, float *out, int size);
int ElementAddRelu6(const float *in0, const float *in1, float *out, int size);
int ElementAddInt(const int *in0, const int *in1, int *out, int size);
int ElementOptAdd(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
int ElementOptAddInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param);
int ElementOptAddRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
int ElementOptAddRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
int BroadcastAdd(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size,
ArithmeticParameter *param);

#ifdef __cplusplus
}
#endif

#endif // MINDSPORE_LITE_NNACL_FP32_ADD_H_

+ 0
- 811
mindspore/lite/nnacl/fp32/arithmetic_fp32.c View File

@@ -19,812 +19,6 @@

#define ACCURACY_DATA 0.00000001

int ElementOptMul(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
#ifdef ENABLE_NEON
float32x4_t vin0_opt = vdupq_n_f32(in0[0]);
float32x4_t vin1_opt = vdupq_n_f32(in1[0]);
#endif
int index = 0;
if (param->in_elements_num0_ == 1) {
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
float32x4_t vin1 = vld1q_f32(in1 + index);
float32x4_t vout = vmulq_f32(vin0_opt, vin1);
vst1q_f32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = in0[0] * in1[index];
}
} else {
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
float32x4_t vin0 = vld1q_f32(in0 + index);
float32x4_t vout = vmulq_f32(vin0, vin1_opt);
vst1q_f32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = in0[index] * in1[0];
}
}
return NNACL_OK;
}

int ElementOptMulRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
#ifdef ENABLE_NEON
float32x4_t vin0_opt = vdupq_n_f32(in0[0]);
float32x4_t vin1_opt = vdupq_n_f32(in1[0]);
float32x4_t zeros = vdupq_n_f32(0.0f);
#endif
int index = 0;
if (param->in_elements_num0_ == 1) {
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
float32x4_t vin1 = vld1q_f32(in1 + index);
float32x4_t vout = vmaxq_f32(vmulq_f32(vin0_opt, vin1), zeros);
vst1q_f32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = MSMAX(in0[0] * in1[index], 0);
}
} else {
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
float32x4_t vin0 = vld1q_f32(in0 + index);
float32x4_t vout = vmaxq_f32(vmulq_f32(vin0, vin1_opt), zeros);
vst1q_f32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = MSMAX(in0[index] * in1[0], 0);
}
}
return NNACL_OK;
}

int ElementOptMulRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
#ifdef ENABLE_NEON
float32x4_t vin0_opt = vdupq_n_f32(in0[0]);
float32x4_t vin1_opt = vdupq_n_f32(in1[0]);
float32x4_t zeros = vdupq_n_f32(0.0f);
float32x4_t bounds = vdupq_n_f32(6.0f);
#endif
int index = 0;
if (param->in_elements_num0_ == 1) {
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
float32x4_t vin1 = vld1q_f32(in1 + index);
float32x4_t vout = vminq_f32(vmaxq_f32(vmulq_f32(vin0_opt, vin1), zeros), bounds);
vst1q_f32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = MSMIN(MSMAX(in0[0] * in1[index], 0), 6);
}
} else {
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
float32x4_t vin0 = vld1q_f32(in0 + index);
float32x4_t vout = vminq_f32(vmaxq_f32(vmulq_f32(vin0, vin1_opt), zeros), bounds);
vst1q_f32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = MSMIN(MSMAX(in0[index] * in1[0], 0), 6);
}
}
return NNACL_OK;
}

int ElementOptMulInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) {
#ifdef ENABLE_NEON
int32x4_t vin0_opt = vdupq_n_s32(in0[0]);
int32x4_t vin1_opt = vdupq_n_s32(in1[0]);
#endif
int index = 0;
if (param->in_elements_num0_ == 1) {
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
int32x4_t vin1 = vld1q_s32(in1 + index);
int32x4_t vout = vmulq_s32(vin0_opt, vin1);
vst1q_s32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = in0[0] * in1[index];
}
} else {
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
int32x4_t vin0 = vld1q_s32(in0 + index);
int32x4_t vout = vmulq_s32(vin0, vin1_opt);
vst1q_s32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = in0[index] * in1[0];
}
}
return NNACL_OK;
}

int ElementOptMulReluInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) {
#ifdef ENABLE_NEON
int32x4_t vin0_opt = vdupq_n_s32(in0[0]);
int32x4_t vin1_opt = vdupq_n_s32(in1[0]);
int32x4_t zeros = vdupq_n_s32(0);
#endif
int index = 0;
if (param->in_elements_num0_ == 1) {
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
int32x4_t vin1 = vld1q_s32(in1 + index);
int32x4_t vout = vmaxq_s32(vmulq_s32(vin0_opt, vin1), zeros);
vst1q_s32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = MSMAX(in0[0] * in1[index], 0);
}
} else {
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
int32x4_t vin0 = vld1q_s32(in0 + index);
int32x4_t vout = vmaxq_s32(vmulq_s32(vin0, vin1_opt), zeros);
vst1q_s32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = MSMAX(in0[index] * in1[0], 0);
}
}
return NNACL_OK;
}

int ElementOptMulRelu6Int(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) {
#ifdef ENABLE_NEON
int32x4_t vin0_opt = vdupq_n_s32(in0[0]);
int32x4_t vin1_opt = vdupq_n_s32(in1[0]);
int32x4_t zeros = vdupq_n_s32(0);
int32x4_t bounds = vdupq_n_s32(6);
#endif
int index = 0;
if (param->in_elements_num0_ == 1) {
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
int32x4_t vin1 = vld1q_s32(in1 + index);
int32x4_t vout = vminq_s32(vmaxq_s32(vmulq_s32(vin0_opt, vin1), zeros), bounds);
vst1q_s32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = MSMIN(MSMAX(in0[0] * in1[index], 0), 6);
}
} else {
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
int32x4_t vin0 = vld1q_s32(in0 + index);
int32x4_t vout = vminq_s32(vmaxq_s32(vmulq_s32(vin0, vin1_opt), zeros), bounds);
vst1q_s32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = MSMIN(MSMAX(in0[index] * in1[0], 0), 6);
}
}
return NNACL_OK;
}

int ElementOptSub(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
#ifdef ENABLE_NEON
float32x4_t vin0_opt = vdupq_n_f32(in0[0]);
float32x4_t vin1_opt = vdupq_n_f32(in1[0]);
#endif
int index = 0;
if (param->in_elements_num0_ == 1) {
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
float32x4_t vin1 = vld1q_f32(in1 + index);
float32x4_t vout = vsubq_f32(vin0_opt, vin1);
vst1q_f32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = in0[0] - in1[index];
}
} else {
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
float32x4_t vin0 = vld1q_f32(in0 + index);
float32x4_t vout = vsubq_f32(vin0, vin1_opt);
vst1q_f32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = in0[index] - in1[0];
}
}
return NNACL_OK;
}

int ElementOptSubInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) {
#ifdef ENABLE_NEON
int32x4_t vin0_opt = vdupq_n_s32(in0[0]);
int32x4_t vin1_opt = vdupq_n_s32(in1[0]);
#endif
int index = 0;
if (param->in_elements_num0_ == 1) {
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
int32x4_t vin1 = vld1q_s32(in1 + index);
int32x4_t vout = vsubq_s32(vin0_opt, vin1);
vst1q_s32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = in0[0] - in1[index];
}
} else {
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
int32x4_t vin0 = vld1q_s32(in0 + index);
int32x4_t vout = vsubq_s32(vin0, vin1_opt);
vst1q_s32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = in0[index] - in1[0];
}
}
return NNACL_OK;
}

int ElementOptSubRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
#ifdef ENABLE_NEON
float32x4_t vin0_opt = vdupq_n_f32(in0[0]);
float32x4_t vin1_opt = vdupq_n_f32(in1[0]);
float32x4_t zeros = vdupq_n_f32(0.0f);
#endif
int index = 0;
if (param->in_elements_num0_ == 1) {
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
float32x4_t vin1 = vld1q_f32(in1 + index);
float32x4_t vout = vmaxq_f32(vsubq_f32(vin0_opt, vin1), zeros);
vst1q_f32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = MSMAX(in0[0] - in1[index], 0);
}
} else {
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
float32x4_t vin0 = vld1q_f32(in0 + index);
float32x4_t vout = vmaxq_f32(vsubq_f32(vin0, vin1_opt), zeros);
vst1q_f32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = MSMAX(in0[index] - in1[0], 0);
}
}
return NNACL_OK;
}

int ElementOptSubRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
#ifdef ENABLE_NEON
float32x4_t vin0_opt = vdupq_n_f32(in0[0]);
float32x4_t vin1_opt = vdupq_n_f32(in1[0]);
float32x4_t zeros = vdupq_n_f32(0.0f);
float32x4_t bounds = vdupq_n_f32(6.0f);
#endif
int index = 0;
if (param->in_elements_num0_ == 1) {
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
float32x4_t vin1 = vld1q_f32(in1 + index);
float32x4_t vout = vminq_f32(vmaxq_f32(vsubq_f32(vin0_opt, vin1), zeros), bounds);
vst1q_f32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = MSMIN(MSMAX(in0[0] - in1[index], 0), 6);
}
} else {
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
float32x4_t vin0 = vld1q_f32(in0 + index);
float32x4_t vout = vminq_f32(vmaxq_f32(vsubq_f32(vin0, vin1_opt), zeros), bounds);
vst1q_f32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = MSMIN(MSMAX(in0[index] - in1[0], 0), 6);
}
}
return NNACL_OK;
}

int ElementOptAdd(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
#ifdef ENABLE_NEON
float32x4_t vin0_opt = vdupq_n_f32(in0[0]);
float32x4_t vin1_opt = vdupq_n_f32(in1[0]);
#endif
int index = 0;
if (param->in_elements_num0_ == 1) {
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
float32x4_t vin1 = vld1q_f32(in1 + index);
float32x4_t vout = vaddq_f32(vin0_opt, vin1);
vst1q_f32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = in0[0] + in1[index];
}
} else {
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
float32x4_t vin0 = vld1q_f32(in0 + index);
float32x4_t vout = vaddq_f32(vin0, vin1_opt);
vst1q_f32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = in0[index] + in1[0];
}
}
return NNACL_OK;
}

int ElementOptAddInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) {
#ifdef ENABLE_NEON
int32x4_t vin0_opt = vdupq_n_s32(in0[0]);
int32x4_t vin1_opt = vdupq_n_s32(in1[0]);
#endif
int index = 0;
if (param->in_elements_num0_ == 1) {
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
int32x4_t vin1 = vld1q_s32(in1 + index);
int32x4_t vout = vaddq_s32(vin0_opt, vin1);
vst1q_s32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = in0[0] + in1[index];
}
} else {
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
int32x4_t vin0 = vld1q_s32(in0 + index);
int32x4_t vout = vaddq_s32(vin0, vin1_opt);
vst1q_s32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = in0[index] + in1[0];
}
}
return NNACL_OK;
}

int ElementOptAddRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
#ifdef ENABLE_NEON
float32x4_t vin0_opt = vdupq_n_f32(in0[0]);
float32x4_t vin1_opt = vdupq_n_f32(in1[0]);
float32x4_t zeros = vdupq_n_f32(0.0f);
#endif
int index = 0;
if (param->in_elements_num0_ == 1) {
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
float32x4_t vin1 = vld1q_f32(in1 + index);
float32x4_t vout = vmaxq_f32(vaddq_f32(vin0_opt, vin1), zeros);
vst1q_f32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = MSMAX(in0[0] + in1[index], 0);
}
} else {
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
float32x4_t vin0 = vld1q_f32(in0 + index);
float32x4_t vout = vmaxq_f32(vaddq_f32(vin0, vin1_opt), zeros);
vst1q_f32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = MSMAX(in0[index] + in1[0], 0);
}
}
return NNACL_OK;
}

int ElementOptAddRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
#ifdef ENABLE_NEON
float32x4_t vin0_opt = vdupq_n_f32(in0[0]);
float32x4_t vin1_opt = vdupq_n_f32(in1[0]);
float32x4_t zeros = vdupq_n_f32(0.0f);
float32x4_t bounds = vdupq_n_f32(6.0f);
#endif
int index = 0;
if (param->in_elements_num0_ == 1) {
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
float32x4_t vin1 = vld1q_f32(in1 + index);
float32x4_t vout = vminq_f32(vmaxq_f32(vaddq_f32(vin0_opt, vin1), zeros), bounds);
vst1q_f32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = MSMIN(MSMAX(in0[0] + in1[index], 0), 6);
}
} else {
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
float32x4_t vin0 = vld1q_f32(in0 + index);
float32x4_t vout = vminq_f32(vmaxq_f32(vaddq_f32(vin0, vin1_opt), zeros), bounds);
vst1q_f32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = MSMIN(MSMAX(in0[index] + in1[0], 0), 6);
}
}

return NNACL_OK;
}

int ElementOptDiv(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
if (param->in_elements_num0_ == 1) {
for (int index = 0; index < size; index++) {
out[index] = in0[0] / in1[index];
}
} else {
if (in1[0] == 0) {
return NNACL_ERRCODE_DIVISOR_ZERO;
}
for (int index = 0; index < size; index++) {
out[index] = in0[index] / in1[0];
}
}
return NNACL_OK;
}

int ElementOptDivRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
if (param->in_elements_num0_ == 1) {
for (int index = 0; index < size; index++) {
out[index] = in0[0] / in1[index];
out[index] = out[index] > 0 ? out[index] : 0;
}
} else {
for (int index = 0; index < size; index++) {
out[index] = in0[index] / in1[0];
out[index] = out[index] > 0 ? out[index] : 0;
}
}
return NNACL_OK;
}

int ElementOptDivRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
if (param->in_elements_num0_ == 1) {
for (int index = 0; index < size; index++) {
out[index] = MSMIN(MSMAX(in0[0] / in1[index], 0), 6);
}
} else {
for (int index = 0; index < size; index++) {
out[index] = MSMIN(MSMAX(in0[index] / in1[0], 0), 6);
}
}
return NNACL_OK;
}

int ElementOptDivInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) {
if (param->in_elements_num0_ == 1) {
for (int index = 0; index < size; index++) {
out[index] = in0[0] / in1[index];
}
} else {
if (in1[0] == 0) {
return NNACL_ERRCODE_DIVISOR_ZERO;
}
for (int index = 0; index < size; index++) {
out[index] = in0[index] / in1[0];
}
}
return NNACL_OK;
}

int BroadcastAdd(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size,
ArithmeticParameter *param) {
TileDimensionsFp32(in0, in1, tile_in0, tile_in1, param);
return ElementAdd(tile_in0, tile_in1, out, size);
}

int BroadcastMul(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size,
ArithmeticParameter *param) {
TileDimensionsFp32(in0, in1, tile_in0, tile_in1, param);
return ElementMul(tile_in0, tile_in1, out, size);
}

int ElementMul(const float *in0, const float *in1, float *out, int size) {
int index = 0;
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
float32x4_t vin0 = vld1q_f32(in0 + index);
float32x4_t vin1 = vld1q_f32(in1 + index);
float32x4_t vout = vmulq_f32(vin0, vin1);
vst1q_f32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = in0[index] * in1[index];
}
return NNACL_OK;
}

int ElementMulRelu(const float *in0, const float *in1, float *out, int size) {
int index = 0;
#ifdef ENABLE_NEON
float32x4_t zeros = vdupq_n_f32(0.0f);
for (; index <= size - 4; index += C4NUM) {
float32x4_t vin0 = vld1q_f32(in0 + index);
float32x4_t vin1 = vld1q_f32(in1 + index);
float32x4_t vout = vmulq_f32(vin0, vin1);
vout = vbslq_f32(vcgtq_f32(vout, zeros), vout, zeros);
vst1q_f32(out + index, vout);
}
#endif
for (; index < size; index++) {
float res = in0[index] * in1[index];
out[index] = res > 0 ? res : 0;
}
return NNACL_OK;
}

int ElementMulRelu6(const float *in0, const float *in1, float *out, int size) {
int index = 0;
#ifdef ENABLE_NEON
float32x4_t zeros = vdupq_n_f32(0.0f);
float32x4_t bounds = vdupq_n_f32(6.0f);
for (; index <= size - 4; index += C4NUM) {
float32x4_t vin0 = vld1q_f32(in0 + index);
float32x4_t vin1 = vld1q_f32(in1 + index);
float32x4_t vout = vminq_f32(vmaxq_f32(vmulq_f32(vin0, vin1), zeros), bounds);
vst1q_f32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = MSMIN(MSMAX(in0[index] * in1[index], 0), 6);
}
return NNACL_OK;
}

int ElementMulInt(const int *in0, const int *in1, int *out, int size) {
int index = 0;
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
int32x4_t vin0 = vld1q_s32(in0 + index);
int32x4_t vin1 = vld1q_s32(in1 + index);
int32x4_t vout = vmulq_s32(vin0, vin1);
vst1q_s32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = in0[index] * in1[index];
}
return NNACL_OK;
}

int ElementMulReluInt(const int *in0, const int *in1, int *out, int size) {
int index = 0;
#ifdef ENABLE_NEON
int32x4_t zeros = vdupq_n_s32(0);
for (; index <= size - 4; index += C4NUM) {
int32x4_t vin0 = vld1q_s32(in0 + index);
int32x4_t vin1 = vld1q_s32(in1 + index);
int32x4_t vout = vmulq_s32(vin0, vin1);
vout = vbslq_s32(vcgtq_s32(vout, zeros), vout, zeros);
vst1q_s32(out + index, vout);
}
#endif
for (; index < size; index++) {
float res = in0[index] * in1[index];
out[index] = res > 0 ? res : 0;
}
return NNACL_OK;
}

int ElementMulRelu6Int(const int *in0, const int *in1, int *out, int size) {
int index = 0;
#ifdef ENABLE_NEON
int32x4_t zeros = vdupq_n_s32(0);
int32x4_t bounds = vdupq_n_s32(6);
for (; index <= size - 4; index += C4NUM) {
int32x4_t vin0 = vld1q_s32(in0 + index);
int32x4_t vin1 = vld1q_s32(in1 + index);
int32x4_t vout = vminq_s32(vmaxq_s32(vmulq_s32(vin0, vin1), zeros), bounds);
vst1q_s32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = MSMIN(MSMAX(in0[index] * in1[index], 0), 6);
}
return NNACL_OK;
}

int ElementAdd(const float *in0, const float *in1, float *out, int size) {
int index = 0;
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
float32x4_t vin0 = vld1q_f32(in0 + index);
float32x4_t vin1 = vld1q_f32(in1 + index);
float32x4_t vout = vaddq_f32(vin0, vin1);
vst1q_f32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = in0[index] + in1[index];
}
return NNACL_OK;
}

int ElementAddRelu(const float *in0, const float *in1, float *out, int size) {
int index = 0;
#ifdef ENABLE_NEON
float32x4_t zeros = vdupq_n_f32(0.0f);
for (; index <= size - 4; index += C4NUM) {
float32x4_t vin0 = vld1q_f32(in0 + index);
float32x4_t vin1 = vld1q_f32(in1 + index);
float32x4_t vout = vaddq_f32(vin0, vin1);
vout = vbslq_f32(vcgtq_f32(vout, zeros), vout, zeros);
vst1q_f32(out + index, vout);
}
#endif
for (; index < size; index++) {
float res = in0[index] + in1[index];
out[index] = res > 0 ? res : 0;
}
return NNACL_OK;
}

int ElementAddRelu6(const float *in0, const float *in1, float *out, int size) {
int index = 0;
#ifdef ENABLE_NEON
float32x4_t zeros = vdupq_n_f32(0.0f);
float32x4_t bounds = vdupq_n_f32(6.0f);
for (; index <= size - 4; index += C4NUM) {
float32x4_t vin0 = vld1q_f32(in0 + index);
float32x4_t vin1 = vld1q_f32(in1 + index);
float32x4_t vout = vminq_f32(vmaxq_f32(vaddq_f32(vin0, vin1), zeros), bounds);
vst1q_f32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = MSMIN(MSMAX(in0[index] + in1[index], 0), 6);
}
return NNACL_OK;
}

int ElementAddInt(const int *in0, const int *in1, int *out, int size) {
int index = 0;
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
int32x4_t vin0 = vld1q_s32(in0 + index);
int32x4_t vin1 = vld1q_s32(in1 + index);
int32x4_t vout = vaddq_s32(vin0, vin1);
vst1q_s32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = in0[index] + in1[index];
}
return NNACL_OK;
}

int ElementSub(const float *in0, const float *in1, float *out, int size) {
int index = 0;
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
float32x4_t vin0 = vld1q_f32(in0 + index);
float32x4_t vin1 = vld1q_f32(in1 + index);
float32x4_t vout = vsubq_f32(vin0, vin1);
vst1q_f32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = in0[index] - in1[index];
}
return NNACL_OK;
}

int ElementSubInt(const int *in0, const int *in1, int *out, int size) {
int index = 0;
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
int32x4_t vin0 = vld1q_s32(in0 + index);
int32x4_t vin1 = vld1q_s32(in1 + index);
int32x4_t vout = vsubq_s32(vin0, vin1);
vst1q_s32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = in0[index] - in1[index];
}
return NNACL_OK;
}

int ElementSubRelu(const float *in0, const float *in1, float *out, int size) {
int index = 0;
#ifdef ENABLE_NEON
float32x4_t zeros = vdupq_n_f32(0.0f);
for (; index <= size - 4; index += C4NUM) {
float32x4_t vin0 = vld1q_f32(in0 + index);
float32x4_t vin1 = vld1q_f32(in1 + index);
float32x4_t vout = vsubq_f32(vin0, vin1);
vout = vbslq_f32(vcgtq_f32(vout, zeros), vout, zeros);
vst1q_f32(out + index, vout);
}
#endif
for (; index < size; index++) {
float res = in0[index] - in1[index];
out[index] = res > 0 ? res : 0;
}
return NNACL_OK;
}

int ElementSubRelu6(const float *in0, const float *in1, float *out, int size) {
int index = 0;
#ifdef ENABLE_NEON
float32x4_t zeros = vdupq_n_f32(0.0f);
float32x4_t bounds = vdupq_n_f32(6.0f);
for (; index <= size - 4; index += C4NUM) {
float32x4_t vin0 = vld1q_f32(in0 + index);
float32x4_t vin1 = vld1q_f32(in1 + index);
float32x4_t vout = vminq_f32(vmaxq_f32(vsubq_f32(vin0, vin1), zeros), bounds);
vst1q_f32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = MSMIN(MSMAX(in0[index] - in1[index], 0), 6);
}

return NNACL_OK;
}

int BroadcastDiv(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size,
ArithmeticParameter *param) {
TileDimensionsFp32(in0, in1, tile_in0, tile_in1, param);
return ElementDiv(tile_in0, tile_in1, out, size);
}

int ElementDiv(const float *in0, const float *in1, float *out, int size) {
for (int i = 0; i < size; i++) {
out[i] = in0[i] / in1[i];
}
return NNACL_OK;
}

int ElementDivRelu(const float *in0, const float *in1, float *out, int size) {
for (int i = 0; i < size; i++) {
float res = in0[i] / in1[i];
out[i] = res > 0 ? res : 0;
}
return NNACL_OK;
}

int ElementDivRelu6(const float *in0, const float *in1, float *out, int size) {
for (int i = 0; i < size; i++) {
out[i] = MSMIN(MSMAX(in0[i] / in1[i], 0), 6);
}
return NNACL_OK;
}

int ElementFloorMod(const float *in0, const float *in1, float *out, int size) {
for (int i = 0; i < size; i++) {
out[i] = in0[i] - floorf(in0[i] / in1[i]) * in1[i];
@@ -929,11 +123,6 @@ int ElementLogicalAndBool(const bool *in0, const bool *in1, bool *out, int size)
return NNACL_OK;
}

int ElementSquaredDifference(const float *in0, const float *in1, float *out, int size) {
ElementSub(in0, in1, out, size);
return ElementMul(out, out, out, size);
}

int ElementLogicalOr(const float *in0, const float *in1, float *out, int size) {
int index = 0;
#ifdef ENABLE_NEON


+ 5
- 53
mindspore/lite/nnacl/fp32/arithmetic_fp32.h View File

@@ -22,6 +22,11 @@
#include "nnacl/op_base.h"
#include "nnacl/base/arithmetic_base.h"
#include "nnacl/errorcode.h"
#include "nnacl/fp32/add_fp32.h"
#include "nnacl/fp32/mul_fp32.h"
#include "nnacl/fp32/div_fp32.h"
#include "nnacl/fp32/sub_fp32.h"
#include "nnacl/fp32/squared_difference.h"

#ifdef __cplusplus
extern "C" {
@@ -30,56 +35,6 @@ void TileOneDimensionFp32(const float *inData, float *outData, int dim, size_t n
const int *inStrides, const int *outStrides, const int *multiple);
void TileDimensionsFp32(const float *data0, const float *data1, float *tile_data0, float *tile_data1,
ArithmeticParameter *param);

/* Mul */
int ElementMul(const float *in0, const float *in1, float *out, int size);
int ElementMulRelu(const float *in0, const float *in1, float *out, int size);
int ElementMulRelu6(const float *in0, const float *in1, float *out, int size);
int ElementMulInt(const int *in0, const int *in1, int *out, int size);
int ElementMulReluInt(const int *in0, const int *in1, int *out, int size);
int ElementMulRelu6Int(const int *in0, const int *in1, int *out, int size);
int ElementOptMul(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
int ElementOptMulRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
int ElementOptMulRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
int ElementOptMulInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param);
int ElementOptMulReluInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param);
int ElementOptMulRelu6Int(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param);
int BroadcastMul(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size,
ArithmeticParameter *param);

/* Add */
int ElementAdd(const float *in0, const float *in1, float *out, int size);
int ElementAddRelu(const float *in0, const float *in1, float *out, int size);
int ElementAddRelu6(const float *in0, const float *in1, float *out, int size);
int ElementAddInt(const int *in0, const int *in1, int *out, int size);
int ElementOptAdd(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
int ElementOptAddInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param);
int ElementOptAddRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
int ElementOptAddRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
int BroadcastAdd(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size,
ArithmeticParameter *param);

/* Sub */
int ElementSub(const float *in0, const float *in1, float *out, int size);
int ElementSubInt(const int *in0, const int *in1, int *out, int size);
int ElementSubRelu(const float *in0, const float *in1, float *out, int size);
int ElementSubRelu6(const float *in0, const float *in1, float *out, int size);
int ElementOptSub(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
int ElementOptSubRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
int ElementOptSubRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
int ElementOptSubInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param);

/* Div */
int ElementDiv(const float *in0, const float *in1, float *out, int size);
int ElementDivRelu(const float *in0, const float *in1, float *out, int size);
int ElementDivRelu6(const float *in0, const float *in1, float *out, int size);
int ElementOptDiv(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
int ElementOptDivRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
int ElementOptDivRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
int ElementOptDivInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param);
int BroadcastDiv(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size,
ArithmeticParameter *param);

/* logical and */
int ElementLogicalAnd(const float *in0, const float *in1, float *out, int size);
int ElementLogicalAndInt(const int *in0, const int *in1, int *out, int size);
@@ -88,9 +43,6 @@ int ElementLogicalAndBool(const bool *in0, const bool *in1, bool *out, int size)
/* logical or */
int ElementLogicalOr(const float *in0, const float *in1, float *out, int size);

/* Element Squared Difference */
int ElementSquaredDifference(const float *in0, const float *in1, float *out, int size);

/* max min */
int ElementMaximum(const float *in0, const float *in1, float *out, int size);
int ElementMinimum(const float *in0, const float *in1, float *out, int size);


+ 107
- 0
mindspore/lite/nnacl/fp32/div_fp32.c View File

@@ -0,0 +1,107 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "nnacl/fp32/div_fp32.h"
#include <math.h>
#include "nnacl/fp32/arithmetic_fp32.h"

int ElementOptDiv(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
if (param->in_elements_num0_ == 1) {
for (int index = 0; index < size; index++) {
out[index] = in0[0] / in1[index];
}
} else {
if (in1[0] == 0) {
return NNACL_ERRCODE_DIVISOR_ZERO;
}
for (int index = 0; index < size; index++) {
out[index] = in0[index] / in1[0];
}
}
return NNACL_OK;
}

int ElementOptDivRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
if (param->in_elements_num0_ == 1) {
for (int index = 0; index < size; index++) {
out[index] = in0[0] / in1[index];
out[index] = out[index] > 0 ? out[index] : 0;
}
} else {
for (int index = 0; index < size; index++) {
out[index] = in0[index] / in1[0];
out[index] = out[index] > 0 ? out[index] : 0;
}
}
return NNACL_OK;
}

int ElementOptDivRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
if (param->in_elements_num0_ == 1) {
for (int index = 0; index < size; index++) {
out[index] = MSMIN(MSMAX(in0[0] / in1[index], 0), 6);
}
} else {
for (int index = 0; index < size; index++) {
out[index] = MSMIN(MSMAX(in0[index] / in1[0], 0), 6);
}
}
return NNACL_OK;
}

int ElementOptDivInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) {
if (param->in_elements_num0_ == 1) {
for (int index = 0; index < size; index++) {
out[index] = in0[0] / in1[index];
}
} else {
if (in1[0] == 0) {
return NNACL_ERRCODE_DIVISOR_ZERO;
}
for (int index = 0; index < size; index++) {
out[index] = in0[index] / in1[0];
}
}
return NNACL_OK;
}

int BroadcastDiv(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size,
ArithmeticParameter *param) {
TileDimensionsFp32(in0, in1, tile_in0, tile_in1, param);
return ElementDiv(tile_in0, tile_in1, out, size);
}

int ElementDiv(const float *in0, const float *in1, float *out, int size) {
for (int i = 0; i < size; i++) {
out[i] = in0[i] / in1[i];
}
return NNACL_OK;
}

int ElementDivRelu(const float *in0, const float *in1, float *out, int size) {
for (int i = 0; i < size; i++) {
float res = in0[i] / in1[i];
out[i] = res > 0 ? res : 0;
}
return NNACL_OK;
}

int ElementDivRelu6(const float *in0, const float *in1, float *out, int size) {
for (int i = 0; i < size; i++) {
out[i] = MSMIN(MSMAX(in0[i] / in1[i], 0), 6);
}
return NNACL_OK;
}

+ 43
- 0
mindspore/lite/nnacl/fp32/div_fp32.h View File

@@ -0,0 +1,43 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_NNACL_FP32_DIV_H_
#define MINDSPORE_LITE_NNACL_FP32_DIV_H_

#ifdef ENABLE_NEON
#include <arm_neon.h>
#endif
#include "nnacl/op_base.h"
#include "nnacl/base/arithmetic_base.h"
#include "nnacl/errorcode.h"

#ifdef __cplusplus
extern "C" {
#endif
int ElementDiv(const float *in0, const float *in1, float *out, int size);
int ElementDivRelu(const float *in0, const float *in1, float *out, int size);
int ElementDivRelu6(const float *in0, const float *in1, float *out, int size);
int ElementOptDiv(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
int ElementOptDivRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
int ElementOptDivRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
int ElementOptDivInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param);
int BroadcastDiv(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size,
ArithmeticParameter *param);

#ifdef __cplusplus
}
#endif

#endif // MINDSPORE_LITE_NNACL_FP32_DIV_H_

+ 1
- 0
mindspore/lite/nnacl/fp32/lstm_fp32.c View File

@@ -19,6 +19,7 @@
#include <float.h>
#include "nnacl/fp32/activation_fp32.h"
#include "nnacl/fp32/arithmetic_fp32.h"
#include "nnacl/fp32/mul_fp32.h"

void InitGate(float *gate_buffer, const float *bias, const LstmParameter *lstm_parm) {
int gate_offest = 0;


+ 327
- 0
mindspore/lite/nnacl/fp32/mul_fp32.c View File

@@ -0,0 +1,327 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "nnacl/fp32/mul_fp32.h"
#include "nnacl/fp32/arithmetic_fp32.h"

int BroadcastMul(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size,
ArithmeticParameter *param) {
TileDimensionsFp32(in0, in1, tile_in0, tile_in1, param);
return ElementMul(tile_in0, tile_in1, out, size);
}

int ElementMul(const float *in0, const float *in1, float *out, int size) {
int index = 0;
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
float32x4_t vin0 = vld1q_f32(in0 + index);
float32x4_t vin1 = vld1q_f32(in1 + index);
float32x4_t vout = vmulq_f32(vin0, vin1);
vst1q_f32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = in0[index] * in1[index];
}
return NNACL_OK;
}

int ElementMulRelu(const float *in0, const float *in1, float *out, int size) {
int index = 0;
#ifdef ENABLE_NEON
float32x4_t zeros = vdupq_n_f32(0.0f);
for (; index <= size - 4; index += C4NUM) {
float32x4_t vin0 = vld1q_f32(in0 + index);
float32x4_t vin1 = vld1q_f32(in1 + index);
float32x4_t vout = vmulq_f32(vin0, vin1);
vout = vbslq_f32(vcgtq_f32(vout, zeros), vout, zeros);
vst1q_f32(out + index, vout);
}
#endif
for (; index < size; index++) {
float res = in0[index] * in1[index];
out[index] = res > 0 ? res : 0;
}
return NNACL_OK;
}

int ElementMulRelu6(const float *in0, const float *in1, float *out, int size) {
int index = 0;
#ifdef ENABLE_NEON
float32x4_t zeros = vdupq_n_f32(0.0f);
float32x4_t bounds = vdupq_n_f32(6.0f);
for (; index <= size - 4; index += C4NUM) {
float32x4_t vin0 = vld1q_f32(in0 + index);
float32x4_t vin1 = vld1q_f32(in1 + index);
float32x4_t vout = vminq_f32(vmaxq_f32(vmulq_f32(vin0, vin1), zeros), bounds);
vst1q_f32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = MSMIN(MSMAX(in0[index] * in1[index], 0), 6);
}
return NNACL_OK;
}

int ElementMulInt(const int *in0, const int *in1, int *out, int size) {
int index = 0;
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
int32x4_t vin0 = vld1q_s32(in0 + index);
int32x4_t vin1 = vld1q_s32(in1 + index);
int32x4_t vout = vmulq_s32(vin0, vin1);
vst1q_s32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = in0[index] * in1[index];
}
return NNACL_OK;
}

int ElementMulReluInt(const int *in0, const int *in1, int *out, int size) {
int index = 0;
#ifdef ENABLE_NEON
int32x4_t zeros = vdupq_n_s32(0);
for (; index <= size - 4; index += C4NUM) {
int32x4_t vin0 = vld1q_s32(in0 + index);
int32x4_t vin1 = vld1q_s32(in1 + index);
int32x4_t vout = vmulq_s32(vin0, vin1);
vout = vbslq_s32(vcgtq_s32(vout, zeros), vout, zeros);
vst1q_s32(out + index, vout);
}
#endif
for (; index < size; index++) {
float res = in0[index] * in1[index];
out[index] = res > 0 ? res : 0;
}
return NNACL_OK;
}

int ElementMulRelu6Int(const int *in0, const int *in1, int *out, int size) {
int index = 0;
#ifdef ENABLE_NEON
int32x4_t zeros = vdupq_n_s32(0);
int32x4_t bounds = vdupq_n_s32(6);
for (; index <= size - 4; index += C4NUM) {
int32x4_t vin0 = vld1q_s32(in0 + index);
int32x4_t vin1 = vld1q_s32(in1 + index);
int32x4_t vout = vminq_s32(vmaxq_s32(vmulq_s32(vin0, vin1), zeros), bounds);
vst1q_s32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = MSMIN(MSMAX(in0[index] * in1[index], 0), 6);
}
return NNACL_OK;
}

int ElementOptMul(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
#ifdef ENABLE_NEON
float32x4_t vin0_opt = vdupq_n_f32(in0[0]);
float32x4_t vin1_opt = vdupq_n_f32(in1[0]);
#endif
int index = 0;
if (param->in_elements_num0_ == 1) {
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
float32x4_t vin1 = vld1q_f32(in1 + index);
float32x4_t vout = vmulq_f32(vin0_opt, vin1);
vst1q_f32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = in0[0] * in1[index];
}
} else {
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
float32x4_t vin0 = vld1q_f32(in0 + index);
float32x4_t vout = vmulq_f32(vin0, vin1_opt);
vst1q_f32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = in0[index] * in1[0];
}
}
return NNACL_OK;
}

int ElementOptMulRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
#ifdef ENABLE_NEON
float32x4_t vin0_opt = vdupq_n_f32(in0[0]);
float32x4_t vin1_opt = vdupq_n_f32(in1[0]);
float32x4_t zeros = vdupq_n_f32(0.0f);
#endif
int index = 0;
if (param->in_elements_num0_ == 1) {
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
float32x4_t vin1 = vld1q_f32(in1 + index);
float32x4_t vout = vmaxq_f32(vmulq_f32(vin0_opt, vin1), zeros);
vst1q_f32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = MSMAX(in0[0] * in1[index], 0);
}
} else {
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
float32x4_t vin0 = vld1q_f32(in0 + index);
float32x4_t vout = vmaxq_f32(vmulq_f32(vin0, vin1_opt), zeros);
vst1q_f32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = MSMAX(in0[index] * in1[0], 0);
}
}
return NNACL_OK;
}

int ElementOptMulRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
#ifdef ENABLE_NEON
float32x4_t vin0_opt = vdupq_n_f32(in0[0]);
float32x4_t vin1_opt = vdupq_n_f32(in1[0]);
float32x4_t zeros = vdupq_n_f32(0.0f);
float32x4_t bounds = vdupq_n_f32(6.0f);
#endif
int index = 0;
if (param->in_elements_num0_ == 1) {
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
float32x4_t vin1 = vld1q_f32(in1 + index);
float32x4_t vout = vminq_f32(vmaxq_f32(vmulq_f32(vin0_opt, vin1), zeros), bounds);
vst1q_f32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = MSMIN(MSMAX(in0[0] * in1[index], 0), 6);
}
} else {
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
float32x4_t vin0 = vld1q_f32(in0 + index);
float32x4_t vout = vminq_f32(vmaxq_f32(vmulq_f32(vin0, vin1_opt), zeros), bounds);
vst1q_f32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = MSMIN(MSMAX(in0[index] * in1[0], 0), 6);
}
}
return NNACL_OK;
}

int ElementOptMulInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) {
#ifdef ENABLE_NEON
int32x4_t vin0_opt = vdupq_n_s32(in0[0]);
int32x4_t vin1_opt = vdupq_n_s32(in1[0]);
#endif
int index = 0;
if (param->in_elements_num0_ == 1) {
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
int32x4_t vin1 = vld1q_s32(in1 + index);
int32x4_t vout = vmulq_s32(vin0_opt, vin1);
vst1q_s32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = in0[0] * in1[index];
}
} else {
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
int32x4_t vin0 = vld1q_s32(in0 + index);
int32x4_t vout = vmulq_s32(vin0, vin1_opt);
vst1q_s32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = in0[index] * in1[0];
}
}
return NNACL_OK;
}

int ElementOptMulReluInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) {
#ifdef ENABLE_NEON
int32x4_t vin0_opt = vdupq_n_s32(in0[0]);
int32x4_t vin1_opt = vdupq_n_s32(in1[0]);
int32x4_t zeros = vdupq_n_s32(0);
#endif
int index = 0;
if (param->in_elements_num0_ == 1) {
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
int32x4_t vin1 = vld1q_s32(in1 + index);
int32x4_t vout = vmaxq_s32(vmulq_s32(vin0_opt, vin1), zeros);
vst1q_s32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = MSMAX(in0[0] * in1[index], 0);
}
} else {
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
int32x4_t vin0 = vld1q_s32(in0 + index);
int32x4_t vout = vmaxq_s32(vmulq_s32(vin0, vin1_opt), zeros);
vst1q_s32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = MSMAX(in0[index] * in1[0], 0);
}
}
return NNACL_OK;
}

int ElementOptMulRelu6Int(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) {
#ifdef ENABLE_NEON
int32x4_t vin0_opt = vdupq_n_s32(in0[0]);
int32x4_t vin1_opt = vdupq_n_s32(in1[0]);
int32x4_t zeros = vdupq_n_s32(0);
int32x4_t bounds = vdupq_n_s32(6);
#endif
int index = 0;
if (param->in_elements_num0_ == 1) {
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
int32x4_t vin1 = vld1q_s32(in1 + index);
int32x4_t vout = vminq_s32(vmaxq_s32(vmulq_s32(vin0_opt, vin1), zeros), bounds);
vst1q_s32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = MSMIN(MSMAX(in0[0] * in1[index], 0), 6);
}
} else {
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
int32x4_t vin0 = vld1q_s32(in0 + index);
int32x4_t vout = vminq_s32(vmaxq_s32(vmulq_s32(vin0, vin1_opt), zeros), bounds);
vst1q_s32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = MSMIN(MSMAX(in0[index] * in1[0], 0), 6);
}
}
return NNACL_OK;
}

+ 49
- 0
mindspore/lite/nnacl/fp32/mul_fp32.h View File

@@ -0,0 +1,49 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_NNACL_FP32_MUL_H_
#define MINDSPORE_LITE_NNACL_FP32_MUL_H_

#ifdef ENABLE_NEON
#include <arm_neon.h>
#endif
#include "nnacl/op_base.h"
#include "nnacl/base/arithmetic_base.h"
#include "nnacl/errorcode.h"

#ifdef __cplusplus
extern "C" {
#endif

int ElementMul(const float *in0, const float *in1, float *out, int size);
int ElementMulRelu(const float *in0, const float *in1, float *out, int size);
int ElementMulRelu6(const float *in0, const float *in1, float *out, int size);
int ElementMulInt(const int *in0, const int *in1, int *out, int size);
int ElementMulReluInt(const int *in0, const int *in1, int *out, int size);
int ElementMulRelu6Int(const int *in0, const int *in1, int *out, int size);
int ElementOptMul(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
int ElementOptMulRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
int ElementOptMulRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
int ElementOptMulInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param);
int ElementOptMulReluInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param);
int ElementOptMulRelu6Int(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param);
int BroadcastMul(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size,
ArithmeticParameter *param);

#ifdef __cplusplus
}
#endif

#endif // MINDSPORE_LITE_NNACL_FP32_MUL_H_

+ 28
- 0
mindspore/lite/nnacl/fp32/squared_difference.c View File

@@ -0,0 +1,28 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_NNACL_SQUARED_DIFFERENCE_H_
#define MINDSPORE_LITE_NNACL_SQUARED_DIFFERENCE_H_

#include "nnacl/fp32/squared_difference.h"
#include "nnacl/fp32/sub_fp32.h"
#include "nnacl/fp32/mul_fp32.h"

int ElementSquaredDifference(const float *in0, const float *in1, float *out, int size) {
ElementSub(in0, in1, out, size);
return ElementMul(out, out, out, size);
}

#endif // MINDSPORE_LITE_NNACL_SQUARED_DIFFERENCE_H_

+ 37
- 0
mindspore/lite/nnacl/fp32/squared_difference.h View File

@@ -0,0 +1,37 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_NNACL_SQUARED_DIFFERENCE_H_
#define MINDSPORE_LITE_NNACL_SQUARED_DIFFERENCE_H_

#ifdef ENABLE_NEON
#include <arm_neon.h>
#endif
#include "nnacl/op_base.h"
#include "nnacl/base/arithmetic_base.h"
#include "nnacl/errorcode.h"

#ifdef __cplusplus
extern "C" {
#endif

/* Element Squared Difference */
int ElementSquaredDifference(const float *in0, const float *in1, float *out, int size);

#ifdef __cplusplus
}
#endif

#endif // MINDSPORE_LITE_NNACL_SQUARED_DIFFERENCE_H_

+ 217
- 0
mindspore/lite/nnacl/fp32/sub_fp32.c View File

@@ -0,0 +1,217 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "nnacl/fp32/sub_fp32.h"

int ElementOptSub(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
#ifdef ENABLE_NEON
float32x4_t vin0_opt = vdupq_n_f32(in0[0]);
float32x4_t vin1_opt = vdupq_n_f32(in1[0]);
#endif
int index = 0;
if (param->in_elements_num0_ == 1) {
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
float32x4_t vin1 = vld1q_f32(in1 + index);
float32x4_t vout = vsubq_f32(vin0_opt, vin1);
vst1q_f32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = in0[0] - in1[index];
}
} else {
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
float32x4_t vin0 = vld1q_f32(in0 + index);
float32x4_t vout = vsubq_f32(vin0, vin1_opt);
vst1q_f32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = in0[index] - in1[0];
}
}
return NNACL_OK;
}

int ElementOptSubInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) {
#ifdef ENABLE_NEON
int32x4_t vin0_opt = vdupq_n_s32(in0[0]);
int32x4_t vin1_opt = vdupq_n_s32(in1[0]);
#endif
int index = 0;
if (param->in_elements_num0_ == 1) {
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
int32x4_t vin1 = vld1q_s32(in1 + index);
int32x4_t vout = vsubq_s32(vin0_opt, vin1);
vst1q_s32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = in0[0] - in1[index];
}
} else {
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
int32x4_t vin0 = vld1q_s32(in0 + index);
int32x4_t vout = vsubq_s32(vin0, vin1_opt);
vst1q_s32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = in0[index] - in1[0];
}
}
return NNACL_OK;
}

int ElementOptSubRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
#ifdef ENABLE_NEON
float32x4_t vin0_opt = vdupq_n_f32(in0[0]);
float32x4_t vin1_opt = vdupq_n_f32(in1[0]);
float32x4_t zeros = vdupq_n_f32(0.0f);
#endif
int index = 0;
if (param->in_elements_num0_ == 1) {
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
float32x4_t vin1 = vld1q_f32(in1 + index);
float32x4_t vout = vmaxq_f32(vsubq_f32(vin0_opt, vin1), zeros);
vst1q_f32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = MSMAX(in0[0] - in1[index], 0);
}
} else {
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
float32x4_t vin0 = vld1q_f32(in0 + index);
float32x4_t vout = vmaxq_f32(vsubq_f32(vin0, vin1_opt), zeros);
vst1q_f32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = MSMAX(in0[index] - in1[0], 0);
}
}
return NNACL_OK;
}

int ElementOptSubRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) {
#ifdef ENABLE_NEON
float32x4_t vin0_opt = vdupq_n_f32(in0[0]);
float32x4_t vin1_opt = vdupq_n_f32(in1[0]);
float32x4_t zeros = vdupq_n_f32(0.0f);
float32x4_t bounds = vdupq_n_f32(6.0f);
#endif
int index = 0;
if (param->in_elements_num0_ == 1) {
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
float32x4_t vin1 = vld1q_f32(in1 + index);
float32x4_t vout = vminq_f32(vmaxq_f32(vsubq_f32(vin0_opt, vin1), zeros), bounds);
vst1q_f32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = MSMIN(MSMAX(in0[0] - in1[index], 0), 6);
}
} else {
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
float32x4_t vin0 = vld1q_f32(in0 + index);
float32x4_t vout = vminq_f32(vmaxq_f32(vsubq_f32(vin0, vin1_opt), zeros), bounds);
vst1q_f32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = MSMIN(MSMAX(in0[index] - in1[0], 0), 6);
}
}
return NNACL_OK;
}

int ElementSub(const float *in0, const float *in1, float *out, int size) {
int index = 0;
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
float32x4_t vin0 = vld1q_f32(in0 + index);
float32x4_t vin1 = vld1q_f32(in1 + index);
float32x4_t vout = vsubq_f32(vin0, vin1);
vst1q_f32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = in0[index] - in1[index];
}
return NNACL_OK;
}

int ElementSubInt(const int *in0, const int *in1, int *out, int size) {
int index = 0;
#ifdef ENABLE_NEON
for (; index <= size - 4; index += C4NUM) {
int32x4_t vin0 = vld1q_s32(in0 + index);
int32x4_t vin1 = vld1q_s32(in1 + index);
int32x4_t vout = vsubq_s32(vin0, vin1);
vst1q_s32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = in0[index] - in1[index];
}
return NNACL_OK;
}

int ElementSubRelu(const float *in0, const float *in1, float *out, int size) {
int index = 0;
#ifdef ENABLE_NEON
float32x4_t zeros = vdupq_n_f32(0.0f);
for (; index <= size - 4; index += C4NUM) {
float32x4_t vin0 = vld1q_f32(in0 + index);
float32x4_t vin1 = vld1q_f32(in1 + index);
float32x4_t vout = vsubq_f32(vin0, vin1);
vout = vbslq_f32(vcgtq_f32(vout, zeros), vout, zeros);
vst1q_f32(out + index, vout);
}
#endif
for (; index < size; index++) {
float res = in0[index] - in1[index];
out[index] = res > 0 ? res : 0;
}
return NNACL_OK;
}

int ElementSubRelu6(const float *in0, const float *in1, float *out, int size) {
int index = 0;
#ifdef ENABLE_NEON
float32x4_t zeros = vdupq_n_f32(0.0f);
float32x4_t bounds = vdupq_n_f32(6.0f);
for (; index <= size - 4; index += C4NUM) {
float32x4_t vin0 = vld1q_f32(in0 + index);
float32x4_t vin1 = vld1q_f32(in1 + index);
float32x4_t vout = vminq_f32(vmaxq_f32(vsubq_f32(vin0, vin1), zeros), bounds);
vst1q_f32(out + index, vout);
}
#endif
for (; index < size; index++) {
out[index] = MSMIN(MSMAX(in0[index] - in1[index], 0), 6);
}

return NNACL_OK;
}

+ 43
- 0
mindspore/lite/nnacl/fp32/sub_fp32.h View File

@@ -0,0 +1,43 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_NNACL_SUB_FP32_H_
#define MINDSPORE_LITE_NNACL_SUB_FP32_H_

#ifdef ENABLE_NEON
#include <arm_neon.h>
#endif
#include "nnacl/op_base.h"
#include "nnacl/base/arithmetic_base.h"
#include "nnacl/errorcode.h"

#ifdef __cplusplus
extern "C" {
#endif

int ElementSub(const float *in0, const float *in1, float *out, int size);
int ElementSubInt(const int *in0, const int *in1, int *out, int size);
int ElementSubRelu(const float *in0, const float *in1, float *out, int size);
int ElementSubRelu6(const float *in0, const float *in1, float *out, int size);
int ElementOptSub(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
int ElementOptSubRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
int ElementOptSubRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param);
int ElementOptSubInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param);

#ifdef __cplusplus
}
#endif

#endif // MINDSPORE_LITE_NNACL_SUB_FP32_H_

Loading…
Cancel
Save