diff --git a/mindspore/lite/nnacl/fp32/add_fp32.c b/mindspore/lite/nnacl/fp32/add_fp32.c new file mode 100644 index 0000000000..11b839766a --- /dev/null +++ b/mindspore/lite/nnacl/fp32/add_fp32.c @@ -0,0 +1,225 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "nnacl/fp32/add_fp32.h" +#include "nnacl/fp32/arithmetic_fp32.h" + +int ElementOptAdd(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { +#ifdef ENABLE_NEON + float32x4_t vin0_opt = vdupq_n_f32(in0[0]); + float32x4_t vin1_opt = vdupq_n_f32(in1[0]); +#endif + int index = 0; + if (param->in_elements_num0_ == 1) { +#ifdef ENABLE_NEON + for (; index <= size - 4; index += C4NUM) { + float32x4_t vin1 = vld1q_f32(in1 + index); + float32x4_t vout = vaddq_f32(vin0_opt, vin1); + vst1q_f32(out + index, vout); + } +#endif + for (; index < size; index++) { + out[index] = in0[0] + in1[index]; + } + } else { +#ifdef ENABLE_NEON + for (; index <= size - 4; index += C4NUM) { + float32x4_t vin0 = vld1q_f32(in0 + index); + float32x4_t vout = vaddq_f32(vin0, vin1_opt); + vst1q_f32(out + index, vout); + } +#endif + for (; index < size; index++) { + out[index] = in0[index] + in1[0]; + } + } + return NNACL_OK; +} + +int ElementOptAddInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) { +#ifdef ENABLE_NEON + int32x4_t vin0_opt = vdupq_n_s32(in0[0]); + int32x4_t vin1_opt = vdupq_n_s32(in1[0]); +#endif + int index = 0; + if (param->in_elements_num0_ == 1) { +#ifdef ENABLE_NEON + for (; index <= size - 4; index += C4NUM) { + int32x4_t vin1 = vld1q_s32(in1 + index); + int32x4_t vout = vaddq_s32(vin0_opt, vin1); + vst1q_s32(out + index, vout); + } +#endif + for (; index < size; index++) { + out[index] = in0[0] + in1[index]; + } + } else { +#ifdef ENABLE_NEON + for (; index <= size - 4; index += C4NUM) { + int32x4_t vin0 = vld1q_s32(in0 + index); + int32x4_t vout = vaddq_s32(vin0, vin1_opt); + vst1q_s32(out + index, vout); + } +#endif + for (; index < size; index++) { + out[index] = in0[index] + in1[0]; + } + } + return NNACL_OK; +} + +int ElementOptAddRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { +#ifdef ENABLE_NEON + float32x4_t vin0_opt = vdupq_n_f32(in0[0]); + float32x4_t vin1_opt = vdupq_n_f32(in1[0]); + float32x4_t zeros = vdupq_n_f32(0.0f); +#endif + int index = 0; + if (param->in_elements_num0_ == 1) { +#ifdef ENABLE_NEON + for (; index <= size - 4; index += C4NUM) { + float32x4_t vin1 = vld1q_f32(in1 + index); + float32x4_t vout = vmaxq_f32(vaddq_f32(vin0_opt, vin1), zeros); + vst1q_f32(out + index, vout); + } +#endif + for (; index < size; index++) { + out[index] = MSMAX(in0[0] + in1[index], 0); + } + } else { +#ifdef ENABLE_NEON + for (; index <= size - 4; index += C4NUM) { + float32x4_t vin0 = vld1q_f32(in0 + index); + float32x4_t vout = vmaxq_f32(vaddq_f32(vin0, vin1_opt), zeros); + vst1q_f32(out + index, vout); + } +#endif + for (; index < size; index++) { + out[index] = MSMAX(in0[index] + in1[0], 0); + } + } + return NNACL_OK; +} + +int ElementOptAddRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { +#ifdef ENABLE_NEON + float32x4_t vin0_opt = vdupq_n_f32(in0[0]); + float32x4_t vin1_opt = vdupq_n_f32(in1[0]); + float32x4_t zeros = vdupq_n_f32(0.0f); + float32x4_t bounds = vdupq_n_f32(6.0f); +#endif + int index = 0; + if (param->in_elements_num0_ == 1) { +#ifdef ENABLE_NEON + for (; index <= size - 4; index += C4NUM) { + float32x4_t vin1 = vld1q_f32(in1 + index); + float32x4_t vout = vminq_f32(vmaxq_f32(vaddq_f32(vin0_opt, vin1), zeros), bounds); + vst1q_f32(out + index, vout); + } +#endif + for (; index < size; index++) { + out[index] = MSMIN(MSMAX(in0[0] + in1[index], 0), 6); + } + } else { +#ifdef ENABLE_NEON + for (; index <= size - 4; index += C4NUM) { + float32x4_t vin0 = vld1q_f32(in0 + index); + float32x4_t vout = vminq_f32(vmaxq_f32(vaddq_f32(vin0, vin1_opt), zeros), bounds); + vst1q_f32(out + index, vout); + } +#endif + for (; index < size; index++) { + out[index] = MSMIN(MSMAX(in0[index] + in1[0], 0), 6); + } + } + + return NNACL_OK; +} + +int BroadcastAdd(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size, + ArithmeticParameter *param) { + TileDimensionsFp32(in0, in1, tile_in0, tile_in1, param); + return ElementAdd(tile_in0, tile_in1, out, size); +} + +int ElementAdd(const float *in0, const float *in1, float *out, int size) { + int index = 0; +#ifdef ENABLE_NEON + for (; index <= size - 4; index += C4NUM) { + float32x4_t vin0 = vld1q_f32(in0 + index); + float32x4_t vin1 = vld1q_f32(in1 + index); + float32x4_t vout = vaddq_f32(vin0, vin1); + vst1q_f32(out + index, vout); + } +#endif + for (; index < size; index++) { + out[index] = in0[index] + in1[index]; + } + return NNACL_OK; +} + +int ElementAddRelu(const float *in0, const float *in1, float *out, int size) { + int index = 0; +#ifdef ENABLE_NEON + float32x4_t zeros = vdupq_n_f32(0.0f); + for (; index <= size - 4; index += C4NUM) { + float32x4_t vin0 = vld1q_f32(in0 + index); + float32x4_t vin1 = vld1q_f32(in1 + index); + float32x4_t vout = vaddq_f32(vin0, vin1); + vout = vbslq_f32(vcgtq_f32(vout, zeros), vout, zeros); + vst1q_f32(out + index, vout); + } +#endif + for (; index < size; index++) { + float res = in0[index] + in1[index]; + out[index] = res > 0 ? res : 0; + } + return NNACL_OK; +} + +int ElementAddRelu6(const float *in0, const float *in1, float *out, int size) { + int index = 0; +#ifdef ENABLE_NEON + float32x4_t zeros = vdupq_n_f32(0.0f); + float32x4_t bounds = vdupq_n_f32(6.0f); + for (; index <= size - 4; index += C4NUM) { + float32x4_t vin0 = vld1q_f32(in0 + index); + float32x4_t vin1 = vld1q_f32(in1 + index); + float32x4_t vout = vminq_f32(vmaxq_f32(vaddq_f32(vin0, vin1), zeros), bounds); + vst1q_f32(out + index, vout); + } +#endif + for (; index < size; index++) { + out[index] = MSMIN(MSMAX(in0[index] + in1[index], 0), 6); + } + return NNACL_OK; +} + +int ElementAddInt(const int *in0, const int *in1, int *out, int size) { + int index = 0; +#ifdef ENABLE_NEON + for (; index <= size - 4; index += C4NUM) { + int32x4_t vin0 = vld1q_s32(in0 + index); + int32x4_t vin1 = vld1q_s32(in1 + index); + int32x4_t vout = vaddq_s32(vin0, vin1); + vst1q_s32(out + index, vout); + } +#endif + for (; index < size; index++) { + out[index] = in0[index] + in1[index]; + } + return NNACL_OK; +} diff --git a/mindspore/lite/nnacl/fp32/add_fp32.h b/mindspore/lite/nnacl/fp32/add_fp32.h new file mode 100644 index 0000000000..4344f33175 --- /dev/null +++ b/mindspore/lite/nnacl/fp32/add_fp32.h @@ -0,0 +1,45 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_LITE_NNACL_FP32_ADD_H_ +#define MINDSPORE_LITE_NNACL_FP32_ADD_H_ + +#ifdef ENABLE_NEON +#include +#endif +#include "nnacl/op_base.h" +#include "nnacl/base/arithmetic_base.h" +#include "nnacl/errorcode.h" + +#ifdef __cplusplus +extern "C" { +#endif + +int ElementAdd(const float *in0, const float *in1, float *out, int size); +int ElementAddRelu(const float *in0, const float *in1, float *out, int size); +int ElementAddRelu6(const float *in0, const float *in1, float *out, int size); +int ElementAddInt(const int *in0, const int *in1, int *out, int size); +int ElementOptAdd(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); +int ElementOptAddInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param); +int ElementOptAddRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); +int ElementOptAddRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); +int BroadcastAdd(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size, + ArithmeticParameter *param); + +#ifdef __cplusplus +} +#endif + +#endif // MINDSPORE_LITE_NNACL_FP32_ADD_H_ diff --git a/mindspore/lite/nnacl/fp32/arithmetic_fp32.c b/mindspore/lite/nnacl/fp32/arithmetic_fp32.c index 73583bd4bc..cda6db6bce 100644 --- a/mindspore/lite/nnacl/fp32/arithmetic_fp32.c +++ b/mindspore/lite/nnacl/fp32/arithmetic_fp32.c @@ -19,812 +19,6 @@ #define ACCURACY_DATA 0.00000001 -int ElementOptMul(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { -#ifdef ENABLE_NEON - float32x4_t vin0_opt = vdupq_n_f32(in0[0]); - float32x4_t vin1_opt = vdupq_n_f32(in1[0]); -#endif - int index = 0; - if (param->in_elements_num0_ == 1) { -#ifdef ENABLE_NEON - for (; index <= size - 4; index += C4NUM) { - float32x4_t vin1 = vld1q_f32(in1 + index); - float32x4_t vout = vmulq_f32(vin0_opt, vin1); - vst1q_f32(out + index, vout); - } -#endif - for (; index < size; index++) { - out[index] = in0[0] * in1[index]; - } - } else { -#ifdef ENABLE_NEON - for (; index <= size - 4; index += C4NUM) { - float32x4_t vin0 = vld1q_f32(in0 + index); - float32x4_t vout = vmulq_f32(vin0, vin1_opt); - vst1q_f32(out + index, vout); - } -#endif - for (; index < size; index++) { - out[index] = in0[index] * in1[0]; - } - } - return NNACL_OK; -} - -int ElementOptMulRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { -#ifdef ENABLE_NEON - float32x4_t vin0_opt = vdupq_n_f32(in0[0]); - float32x4_t vin1_opt = vdupq_n_f32(in1[0]); - float32x4_t zeros = vdupq_n_f32(0.0f); -#endif - int index = 0; - if (param->in_elements_num0_ == 1) { -#ifdef ENABLE_NEON - for (; index <= size - 4; index += C4NUM) { - float32x4_t vin1 = vld1q_f32(in1 + index); - float32x4_t vout = vmaxq_f32(vmulq_f32(vin0_opt, vin1), zeros); - vst1q_f32(out + index, vout); - } -#endif - for (; index < size; index++) { - out[index] = MSMAX(in0[0] * in1[index], 0); - } - } else { -#ifdef ENABLE_NEON - for (; index <= size - 4; index += C4NUM) { - float32x4_t vin0 = vld1q_f32(in0 + index); - float32x4_t vout = vmaxq_f32(vmulq_f32(vin0, vin1_opt), zeros); - vst1q_f32(out + index, vout); - } -#endif - for (; index < size; index++) { - out[index] = MSMAX(in0[index] * in1[0], 0); - } - } - return NNACL_OK; -} - -int ElementOptMulRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { -#ifdef ENABLE_NEON - float32x4_t vin0_opt = vdupq_n_f32(in0[0]); - float32x4_t vin1_opt = vdupq_n_f32(in1[0]); - float32x4_t zeros = vdupq_n_f32(0.0f); - float32x4_t bounds = vdupq_n_f32(6.0f); -#endif - int index = 0; - if (param->in_elements_num0_ == 1) { -#ifdef ENABLE_NEON - for (; index <= size - 4; index += C4NUM) { - float32x4_t vin1 = vld1q_f32(in1 + index); - float32x4_t vout = vminq_f32(vmaxq_f32(vmulq_f32(vin0_opt, vin1), zeros), bounds); - vst1q_f32(out + index, vout); - } -#endif - for (; index < size; index++) { - out[index] = MSMIN(MSMAX(in0[0] * in1[index], 0), 6); - } - } else { -#ifdef ENABLE_NEON - for (; index <= size - 4; index += C4NUM) { - float32x4_t vin0 = vld1q_f32(in0 + index); - float32x4_t vout = vminq_f32(vmaxq_f32(vmulq_f32(vin0, vin1_opt), zeros), bounds); - vst1q_f32(out + index, vout); - } -#endif - for (; index < size; index++) { - out[index] = MSMIN(MSMAX(in0[index] * in1[0], 0), 6); - } - } - return NNACL_OK; -} - -int ElementOptMulInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) { -#ifdef ENABLE_NEON - int32x4_t vin0_opt = vdupq_n_s32(in0[0]); - int32x4_t vin1_opt = vdupq_n_s32(in1[0]); -#endif - int index = 0; - if (param->in_elements_num0_ == 1) { -#ifdef ENABLE_NEON - for (; index <= size - 4; index += C4NUM) { - int32x4_t vin1 = vld1q_s32(in1 + index); - int32x4_t vout = vmulq_s32(vin0_opt, vin1); - vst1q_s32(out + index, vout); - } -#endif - for (; index < size; index++) { - out[index] = in0[0] * in1[index]; - } - } else { -#ifdef ENABLE_NEON - for (; index <= size - 4; index += C4NUM) { - int32x4_t vin0 = vld1q_s32(in0 + index); - int32x4_t vout = vmulq_s32(vin0, vin1_opt); - vst1q_s32(out + index, vout); - } -#endif - for (; index < size; index++) { - out[index] = in0[index] * in1[0]; - } - } - return NNACL_OK; -} - -int ElementOptMulReluInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) { -#ifdef ENABLE_NEON - int32x4_t vin0_opt = vdupq_n_s32(in0[0]); - int32x4_t vin1_opt = vdupq_n_s32(in1[0]); - int32x4_t zeros = vdupq_n_s32(0); -#endif - int index = 0; - if (param->in_elements_num0_ == 1) { -#ifdef ENABLE_NEON - for (; index <= size - 4; index += C4NUM) { - int32x4_t vin1 = vld1q_s32(in1 + index); - int32x4_t vout = vmaxq_s32(vmulq_s32(vin0_opt, vin1), zeros); - vst1q_s32(out + index, vout); - } -#endif - for (; index < size; index++) { - out[index] = MSMAX(in0[0] * in1[index], 0); - } - } else { -#ifdef ENABLE_NEON - for (; index <= size - 4; index += C4NUM) { - int32x4_t vin0 = vld1q_s32(in0 + index); - int32x4_t vout = vmaxq_s32(vmulq_s32(vin0, vin1_opt), zeros); - vst1q_s32(out + index, vout); - } -#endif - for (; index < size; index++) { - out[index] = MSMAX(in0[index] * in1[0], 0); - } - } - return NNACL_OK; -} - -int ElementOptMulRelu6Int(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) { -#ifdef ENABLE_NEON - int32x4_t vin0_opt = vdupq_n_s32(in0[0]); - int32x4_t vin1_opt = vdupq_n_s32(in1[0]); - int32x4_t zeros = vdupq_n_s32(0); - int32x4_t bounds = vdupq_n_s32(6); -#endif - int index = 0; - if (param->in_elements_num0_ == 1) { -#ifdef ENABLE_NEON - for (; index <= size - 4; index += C4NUM) { - int32x4_t vin1 = vld1q_s32(in1 + index); - int32x4_t vout = vminq_s32(vmaxq_s32(vmulq_s32(vin0_opt, vin1), zeros), bounds); - vst1q_s32(out + index, vout); - } -#endif - for (; index < size; index++) { - out[index] = MSMIN(MSMAX(in0[0] * in1[index], 0), 6); - } - } else { -#ifdef ENABLE_NEON - for (; index <= size - 4; index += C4NUM) { - int32x4_t vin0 = vld1q_s32(in0 + index); - int32x4_t vout = vminq_s32(vmaxq_s32(vmulq_s32(vin0, vin1_opt), zeros), bounds); - vst1q_s32(out + index, vout); - } -#endif - for (; index < size; index++) { - out[index] = MSMIN(MSMAX(in0[index] * in1[0], 0), 6); - } - } - return NNACL_OK; -} - -int ElementOptSub(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { -#ifdef ENABLE_NEON - float32x4_t vin0_opt = vdupq_n_f32(in0[0]); - float32x4_t vin1_opt = vdupq_n_f32(in1[0]); -#endif - int index = 0; - if (param->in_elements_num0_ == 1) { -#ifdef ENABLE_NEON - for (; index <= size - 4; index += C4NUM) { - float32x4_t vin1 = vld1q_f32(in1 + index); - float32x4_t vout = vsubq_f32(vin0_opt, vin1); - vst1q_f32(out + index, vout); - } -#endif - for (; index < size; index++) { - out[index] = in0[0] - in1[index]; - } - } else { -#ifdef ENABLE_NEON - for (; index <= size - 4; index += C4NUM) { - float32x4_t vin0 = vld1q_f32(in0 + index); - float32x4_t vout = vsubq_f32(vin0, vin1_opt); - vst1q_f32(out + index, vout); - } -#endif - for (; index < size; index++) { - out[index] = in0[index] - in1[0]; - } - } - return NNACL_OK; -} - -int ElementOptSubInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) { -#ifdef ENABLE_NEON - int32x4_t vin0_opt = vdupq_n_s32(in0[0]); - int32x4_t vin1_opt = vdupq_n_s32(in1[0]); -#endif - int index = 0; - if (param->in_elements_num0_ == 1) { -#ifdef ENABLE_NEON - for (; index <= size - 4; index += C4NUM) { - int32x4_t vin1 = vld1q_s32(in1 + index); - int32x4_t vout = vsubq_s32(vin0_opt, vin1); - vst1q_s32(out + index, vout); - } -#endif - for (; index < size; index++) { - out[index] = in0[0] - in1[index]; - } - } else { -#ifdef ENABLE_NEON - for (; index <= size - 4; index += C4NUM) { - int32x4_t vin0 = vld1q_s32(in0 + index); - int32x4_t vout = vsubq_s32(vin0, vin1_opt); - vst1q_s32(out + index, vout); - } -#endif - for (; index < size; index++) { - out[index] = in0[index] - in1[0]; - } - } - return NNACL_OK; -} - -int ElementOptSubRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { -#ifdef ENABLE_NEON - float32x4_t vin0_opt = vdupq_n_f32(in0[0]); - float32x4_t vin1_opt = vdupq_n_f32(in1[0]); - float32x4_t zeros = vdupq_n_f32(0.0f); -#endif - int index = 0; - if (param->in_elements_num0_ == 1) { -#ifdef ENABLE_NEON - for (; index <= size - 4; index += C4NUM) { - float32x4_t vin1 = vld1q_f32(in1 + index); - float32x4_t vout = vmaxq_f32(vsubq_f32(vin0_opt, vin1), zeros); - vst1q_f32(out + index, vout); - } -#endif - for (; index < size; index++) { - out[index] = MSMAX(in0[0] - in1[index], 0); - } - } else { -#ifdef ENABLE_NEON - for (; index <= size - 4; index += C4NUM) { - float32x4_t vin0 = vld1q_f32(in0 + index); - float32x4_t vout = vmaxq_f32(vsubq_f32(vin0, vin1_opt), zeros); - vst1q_f32(out + index, vout); - } -#endif - for (; index < size; index++) { - out[index] = MSMAX(in0[index] - in1[0], 0); - } - } - return NNACL_OK; -} - -int ElementOptSubRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { -#ifdef ENABLE_NEON - float32x4_t vin0_opt = vdupq_n_f32(in0[0]); - float32x4_t vin1_opt = vdupq_n_f32(in1[0]); - float32x4_t zeros = vdupq_n_f32(0.0f); - float32x4_t bounds = vdupq_n_f32(6.0f); -#endif - int index = 0; - if (param->in_elements_num0_ == 1) { -#ifdef ENABLE_NEON - for (; index <= size - 4; index += C4NUM) { - float32x4_t vin1 = vld1q_f32(in1 + index); - float32x4_t vout = vminq_f32(vmaxq_f32(vsubq_f32(vin0_opt, vin1), zeros), bounds); - vst1q_f32(out + index, vout); - } -#endif - for (; index < size; index++) { - out[index] = MSMIN(MSMAX(in0[0] - in1[index], 0), 6); - } - } else { -#ifdef ENABLE_NEON - for (; index <= size - 4; index += C4NUM) { - float32x4_t vin0 = vld1q_f32(in0 + index); - float32x4_t vout = vminq_f32(vmaxq_f32(vsubq_f32(vin0, vin1_opt), zeros), bounds); - vst1q_f32(out + index, vout); - } -#endif - for (; index < size; index++) { - out[index] = MSMIN(MSMAX(in0[index] - in1[0], 0), 6); - } - } - return NNACL_OK; -} - -int ElementOptAdd(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { -#ifdef ENABLE_NEON - float32x4_t vin0_opt = vdupq_n_f32(in0[0]); - float32x4_t vin1_opt = vdupq_n_f32(in1[0]); -#endif - int index = 0; - if (param->in_elements_num0_ == 1) { -#ifdef ENABLE_NEON - for (; index <= size - 4; index += C4NUM) { - float32x4_t vin1 = vld1q_f32(in1 + index); - float32x4_t vout = vaddq_f32(vin0_opt, vin1); - vst1q_f32(out + index, vout); - } -#endif - for (; index < size; index++) { - out[index] = in0[0] + in1[index]; - } - } else { -#ifdef ENABLE_NEON - for (; index <= size - 4; index += C4NUM) { - float32x4_t vin0 = vld1q_f32(in0 + index); - float32x4_t vout = vaddq_f32(vin0, vin1_opt); - vst1q_f32(out + index, vout); - } -#endif - for (; index < size; index++) { - out[index] = in0[index] + in1[0]; - } - } - return NNACL_OK; -} - -int ElementOptAddInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) { -#ifdef ENABLE_NEON - int32x4_t vin0_opt = vdupq_n_s32(in0[0]); - int32x4_t vin1_opt = vdupq_n_s32(in1[0]); -#endif - int index = 0; - if (param->in_elements_num0_ == 1) { -#ifdef ENABLE_NEON - for (; index <= size - 4; index += C4NUM) { - int32x4_t vin1 = vld1q_s32(in1 + index); - int32x4_t vout = vaddq_s32(vin0_opt, vin1); - vst1q_s32(out + index, vout); - } -#endif - for (; index < size; index++) { - out[index] = in0[0] + in1[index]; - } - } else { -#ifdef ENABLE_NEON - for (; index <= size - 4; index += C4NUM) { - int32x4_t vin0 = vld1q_s32(in0 + index); - int32x4_t vout = vaddq_s32(vin0, vin1_opt); - vst1q_s32(out + index, vout); - } -#endif - for (; index < size; index++) { - out[index] = in0[index] + in1[0]; - } - } - return NNACL_OK; -} - -int ElementOptAddRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { -#ifdef ENABLE_NEON - float32x4_t vin0_opt = vdupq_n_f32(in0[0]); - float32x4_t vin1_opt = vdupq_n_f32(in1[0]); - float32x4_t zeros = vdupq_n_f32(0.0f); -#endif - int index = 0; - if (param->in_elements_num0_ == 1) { -#ifdef ENABLE_NEON - for (; index <= size - 4; index += C4NUM) { - float32x4_t vin1 = vld1q_f32(in1 + index); - float32x4_t vout = vmaxq_f32(vaddq_f32(vin0_opt, vin1), zeros); - vst1q_f32(out + index, vout); - } -#endif - for (; index < size; index++) { - out[index] = MSMAX(in0[0] + in1[index], 0); - } - } else { -#ifdef ENABLE_NEON - for (; index <= size - 4; index += C4NUM) { - float32x4_t vin0 = vld1q_f32(in0 + index); - float32x4_t vout = vmaxq_f32(vaddq_f32(vin0, vin1_opt), zeros); - vst1q_f32(out + index, vout); - } -#endif - for (; index < size; index++) { - out[index] = MSMAX(in0[index] + in1[0], 0); - } - } - return NNACL_OK; -} - -int ElementOptAddRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { -#ifdef ENABLE_NEON - float32x4_t vin0_opt = vdupq_n_f32(in0[0]); - float32x4_t vin1_opt = vdupq_n_f32(in1[0]); - float32x4_t zeros = vdupq_n_f32(0.0f); - float32x4_t bounds = vdupq_n_f32(6.0f); -#endif - int index = 0; - if (param->in_elements_num0_ == 1) { -#ifdef ENABLE_NEON - for (; index <= size - 4; index += C4NUM) { - float32x4_t vin1 = vld1q_f32(in1 + index); - float32x4_t vout = vminq_f32(vmaxq_f32(vaddq_f32(vin0_opt, vin1), zeros), bounds); - vst1q_f32(out + index, vout); - } -#endif - for (; index < size; index++) { - out[index] = MSMIN(MSMAX(in0[0] + in1[index], 0), 6); - } - } else { -#ifdef ENABLE_NEON - for (; index <= size - 4; index += C4NUM) { - float32x4_t vin0 = vld1q_f32(in0 + index); - float32x4_t vout = vminq_f32(vmaxq_f32(vaddq_f32(vin0, vin1_opt), zeros), bounds); - vst1q_f32(out + index, vout); - } -#endif - for (; index < size; index++) { - out[index] = MSMIN(MSMAX(in0[index] + in1[0], 0), 6); - } - } - - return NNACL_OK; -} - -int ElementOptDiv(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { - if (param->in_elements_num0_ == 1) { - for (int index = 0; index < size; index++) { - out[index] = in0[0] / in1[index]; - } - } else { - if (in1[0] == 0) { - return NNACL_ERRCODE_DIVISOR_ZERO; - } - for (int index = 0; index < size; index++) { - out[index] = in0[index] / in1[0]; - } - } - return NNACL_OK; -} - -int ElementOptDivRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { - if (param->in_elements_num0_ == 1) { - for (int index = 0; index < size; index++) { - out[index] = in0[0] / in1[index]; - out[index] = out[index] > 0 ? out[index] : 0; - } - } else { - for (int index = 0; index < size; index++) { - out[index] = in0[index] / in1[0]; - out[index] = out[index] > 0 ? out[index] : 0; - } - } - return NNACL_OK; -} - -int ElementOptDivRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { - if (param->in_elements_num0_ == 1) { - for (int index = 0; index < size; index++) { - out[index] = MSMIN(MSMAX(in0[0] / in1[index], 0), 6); - } - } else { - for (int index = 0; index < size; index++) { - out[index] = MSMIN(MSMAX(in0[index] / in1[0], 0), 6); - } - } - return NNACL_OK; -} - -int ElementOptDivInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) { - if (param->in_elements_num0_ == 1) { - for (int index = 0; index < size; index++) { - out[index] = in0[0] / in1[index]; - } - } else { - if (in1[0] == 0) { - return NNACL_ERRCODE_DIVISOR_ZERO; - } - for (int index = 0; index < size; index++) { - out[index] = in0[index] / in1[0]; - } - } - return NNACL_OK; -} - -int BroadcastAdd(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size, - ArithmeticParameter *param) { - TileDimensionsFp32(in0, in1, tile_in0, tile_in1, param); - return ElementAdd(tile_in0, tile_in1, out, size); -} - -int BroadcastMul(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size, - ArithmeticParameter *param) { - TileDimensionsFp32(in0, in1, tile_in0, tile_in1, param); - return ElementMul(tile_in0, tile_in1, out, size); -} - -int ElementMul(const float *in0, const float *in1, float *out, int size) { - int index = 0; -#ifdef ENABLE_NEON - for (; index <= size - 4; index += C4NUM) { - float32x4_t vin0 = vld1q_f32(in0 + index); - float32x4_t vin1 = vld1q_f32(in1 + index); - float32x4_t vout = vmulq_f32(vin0, vin1); - vst1q_f32(out + index, vout); - } -#endif - for (; index < size; index++) { - out[index] = in0[index] * in1[index]; - } - return NNACL_OK; -} - -int ElementMulRelu(const float *in0, const float *in1, float *out, int size) { - int index = 0; -#ifdef ENABLE_NEON - float32x4_t zeros = vdupq_n_f32(0.0f); - for (; index <= size - 4; index += C4NUM) { - float32x4_t vin0 = vld1q_f32(in0 + index); - float32x4_t vin1 = vld1q_f32(in1 + index); - float32x4_t vout = vmulq_f32(vin0, vin1); - vout = vbslq_f32(vcgtq_f32(vout, zeros), vout, zeros); - vst1q_f32(out + index, vout); - } -#endif - for (; index < size; index++) { - float res = in0[index] * in1[index]; - out[index] = res > 0 ? res : 0; - } - return NNACL_OK; -} - -int ElementMulRelu6(const float *in0, const float *in1, float *out, int size) { - int index = 0; -#ifdef ENABLE_NEON - float32x4_t zeros = vdupq_n_f32(0.0f); - float32x4_t bounds = vdupq_n_f32(6.0f); - for (; index <= size - 4; index += C4NUM) { - float32x4_t vin0 = vld1q_f32(in0 + index); - float32x4_t vin1 = vld1q_f32(in1 + index); - float32x4_t vout = vminq_f32(vmaxq_f32(vmulq_f32(vin0, vin1), zeros), bounds); - vst1q_f32(out + index, vout); - } -#endif - for (; index < size; index++) { - out[index] = MSMIN(MSMAX(in0[index] * in1[index], 0), 6); - } - return NNACL_OK; -} - -int ElementMulInt(const int *in0, const int *in1, int *out, int size) { - int index = 0; -#ifdef ENABLE_NEON - for (; index <= size - 4; index += C4NUM) { - int32x4_t vin0 = vld1q_s32(in0 + index); - int32x4_t vin1 = vld1q_s32(in1 + index); - int32x4_t vout = vmulq_s32(vin0, vin1); - vst1q_s32(out + index, vout); - } -#endif - for (; index < size; index++) { - out[index] = in0[index] * in1[index]; - } - return NNACL_OK; -} - -int ElementMulReluInt(const int *in0, const int *in1, int *out, int size) { - int index = 0; -#ifdef ENABLE_NEON - int32x4_t zeros = vdupq_n_s32(0); - for (; index <= size - 4; index += C4NUM) { - int32x4_t vin0 = vld1q_s32(in0 + index); - int32x4_t vin1 = vld1q_s32(in1 + index); - int32x4_t vout = vmulq_s32(vin0, vin1); - vout = vbslq_s32(vcgtq_s32(vout, zeros), vout, zeros); - vst1q_s32(out + index, vout); - } -#endif - for (; index < size; index++) { - float res = in0[index] * in1[index]; - out[index] = res > 0 ? res : 0; - } - return NNACL_OK; -} - -int ElementMulRelu6Int(const int *in0, const int *in1, int *out, int size) { - int index = 0; -#ifdef ENABLE_NEON - int32x4_t zeros = vdupq_n_s32(0); - int32x4_t bounds = vdupq_n_s32(6); - for (; index <= size - 4; index += C4NUM) { - int32x4_t vin0 = vld1q_s32(in0 + index); - int32x4_t vin1 = vld1q_s32(in1 + index); - int32x4_t vout = vminq_s32(vmaxq_s32(vmulq_s32(vin0, vin1), zeros), bounds); - vst1q_s32(out + index, vout); - } -#endif - for (; index < size; index++) { - out[index] = MSMIN(MSMAX(in0[index] * in1[index], 0), 6); - } - return NNACL_OK; -} - -int ElementAdd(const float *in0, const float *in1, float *out, int size) { - int index = 0; -#ifdef ENABLE_NEON - for (; index <= size - 4; index += C4NUM) { - float32x4_t vin0 = vld1q_f32(in0 + index); - float32x4_t vin1 = vld1q_f32(in1 + index); - float32x4_t vout = vaddq_f32(vin0, vin1); - vst1q_f32(out + index, vout); - } -#endif - for (; index < size; index++) { - out[index] = in0[index] + in1[index]; - } - return NNACL_OK; -} - -int ElementAddRelu(const float *in0, const float *in1, float *out, int size) { - int index = 0; -#ifdef ENABLE_NEON - float32x4_t zeros = vdupq_n_f32(0.0f); - for (; index <= size - 4; index += C4NUM) { - float32x4_t vin0 = vld1q_f32(in0 + index); - float32x4_t vin1 = vld1q_f32(in1 + index); - float32x4_t vout = vaddq_f32(vin0, vin1); - vout = vbslq_f32(vcgtq_f32(vout, zeros), vout, zeros); - vst1q_f32(out + index, vout); - } -#endif - for (; index < size; index++) { - float res = in0[index] + in1[index]; - out[index] = res > 0 ? res : 0; - } - return NNACL_OK; -} - -int ElementAddRelu6(const float *in0, const float *in1, float *out, int size) { - int index = 0; -#ifdef ENABLE_NEON - float32x4_t zeros = vdupq_n_f32(0.0f); - float32x4_t bounds = vdupq_n_f32(6.0f); - for (; index <= size - 4; index += C4NUM) { - float32x4_t vin0 = vld1q_f32(in0 + index); - float32x4_t vin1 = vld1q_f32(in1 + index); - float32x4_t vout = vminq_f32(vmaxq_f32(vaddq_f32(vin0, vin1), zeros), bounds); - vst1q_f32(out + index, vout); - } -#endif - for (; index < size; index++) { - out[index] = MSMIN(MSMAX(in0[index] + in1[index], 0), 6); - } - return NNACL_OK; -} - -int ElementAddInt(const int *in0, const int *in1, int *out, int size) { - int index = 0; -#ifdef ENABLE_NEON - for (; index <= size - 4; index += C4NUM) { - int32x4_t vin0 = vld1q_s32(in0 + index); - int32x4_t vin1 = vld1q_s32(in1 + index); - int32x4_t vout = vaddq_s32(vin0, vin1); - vst1q_s32(out + index, vout); - } -#endif - for (; index < size; index++) { - out[index] = in0[index] + in1[index]; - } - return NNACL_OK; -} - -int ElementSub(const float *in0, const float *in1, float *out, int size) { - int index = 0; -#ifdef ENABLE_NEON - for (; index <= size - 4; index += C4NUM) { - float32x4_t vin0 = vld1q_f32(in0 + index); - float32x4_t vin1 = vld1q_f32(in1 + index); - float32x4_t vout = vsubq_f32(vin0, vin1); - vst1q_f32(out + index, vout); - } -#endif - for (; index < size; index++) { - out[index] = in0[index] - in1[index]; - } - return NNACL_OK; -} - -int ElementSubInt(const int *in0, const int *in1, int *out, int size) { - int index = 0; -#ifdef ENABLE_NEON - for (; index <= size - 4; index += C4NUM) { - int32x4_t vin0 = vld1q_s32(in0 + index); - int32x4_t vin1 = vld1q_s32(in1 + index); - int32x4_t vout = vsubq_s32(vin0, vin1); - vst1q_s32(out + index, vout); - } -#endif - for (; index < size; index++) { - out[index] = in0[index] - in1[index]; - } - return NNACL_OK; -} - -int ElementSubRelu(const float *in0, const float *in1, float *out, int size) { - int index = 0; -#ifdef ENABLE_NEON - float32x4_t zeros = vdupq_n_f32(0.0f); - for (; index <= size - 4; index += C4NUM) { - float32x4_t vin0 = vld1q_f32(in0 + index); - float32x4_t vin1 = vld1q_f32(in1 + index); - float32x4_t vout = vsubq_f32(vin0, vin1); - vout = vbslq_f32(vcgtq_f32(vout, zeros), vout, zeros); - vst1q_f32(out + index, vout); - } -#endif - for (; index < size; index++) { - float res = in0[index] - in1[index]; - out[index] = res > 0 ? res : 0; - } - return NNACL_OK; -} - -int ElementSubRelu6(const float *in0, const float *in1, float *out, int size) { - int index = 0; -#ifdef ENABLE_NEON - float32x4_t zeros = vdupq_n_f32(0.0f); - float32x4_t bounds = vdupq_n_f32(6.0f); - for (; index <= size - 4; index += C4NUM) { - float32x4_t vin0 = vld1q_f32(in0 + index); - float32x4_t vin1 = vld1q_f32(in1 + index); - float32x4_t vout = vminq_f32(vmaxq_f32(vsubq_f32(vin0, vin1), zeros), bounds); - vst1q_f32(out + index, vout); - } -#endif - for (; index < size; index++) { - out[index] = MSMIN(MSMAX(in0[index] - in1[index], 0), 6); - } - - return NNACL_OK; -} - -int BroadcastDiv(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size, - ArithmeticParameter *param) { - TileDimensionsFp32(in0, in1, tile_in0, tile_in1, param); - return ElementDiv(tile_in0, tile_in1, out, size); -} - -int ElementDiv(const float *in0, const float *in1, float *out, int size) { - for (int i = 0; i < size; i++) { - out[i] = in0[i] / in1[i]; - } - return NNACL_OK; -} - -int ElementDivRelu(const float *in0, const float *in1, float *out, int size) { - for (int i = 0; i < size; i++) { - float res = in0[i] / in1[i]; - out[i] = res > 0 ? res : 0; - } - return NNACL_OK; -} - -int ElementDivRelu6(const float *in0, const float *in1, float *out, int size) { - for (int i = 0; i < size; i++) { - out[i] = MSMIN(MSMAX(in0[i] / in1[i], 0), 6); - } - return NNACL_OK; -} - int ElementFloorMod(const float *in0, const float *in1, float *out, int size) { for (int i = 0; i < size; i++) { out[i] = in0[i] - floorf(in0[i] / in1[i]) * in1[i]; @@ -929,11 +123,6 @@ int ElementLogicalAndBool(const bool *in0, const bool *in1, bool *out, int size) return NNACL_OK; } -int ElementSquaredDifference(const float *in0, const float *in1, float *out, int size) { - ElementSub(in0, in1, out, size); - return ElementMul(out, out, out, size); -} - int ElementLogicalOr(const float *in0, const float *in1, float *out, int size) { int index = 0; #ifdef ENABLE_NEON diff --git a/mindspore/lite/nnacl/fp32/arithmetic_fp32.h b/mindspore/lite/nnacl/fp32/arithmetic_fp32.h index f076e40459..1580ea3b9f 100644 --- a/mindspore/lite/nnacl/fp32/arithmetic_fp32.h +++ b/mindspore/lite/nnacl/fp32/arithmetic_fp32.h @@ -22,6 +22,11 @@ #include "nnacl/op_base.h" #include "nnacl/base/arithmetic_base.h" #include "nnacl/errorcode.h" +#include "nnacl/fp32/add_fp32.h" +#include "nnacl/fp32/mul_fp32.h" +#include "nnacl/fp32/div_fp32.h" +#include "nnacl/fp32/sub_fp32.h" +#include "nnacl/fp32/squared_difference.h" #ifdef __cplusplus extern "C" { @@ -30,56 +35,6 @@ void TileOneDimensionFp32(const float *inData, float *outData, int dim, size_t n const int *inStrides, const int *outStrides, const int *multiple); void TileDimensionsFp32(const float *data0, const float *data1, float *tile_data0, float *tile_data1, ArithmeticParameter *param); - -/* Mul */ -int ElementMul(const float *in0, const float *in1, float *out, int size); -int ElementMulRelu(const float *in0, const float *in1, float *out, int size); -int ElementMulRelu6(const float *in0, const float *in1, float *out, int size); -int ElementMulInt(const int *in0, const int *in1, int *out, int size); -int ElementMulReluInt(const int *in0, const int *in1, int *out, int size); -int ElementMulRelu6Int(const int *in0, const int *in1, int *out, int size); -int ElementOptMul(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); -int ElementOptMulRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); -int ElementOptMulRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); -int ElementOptMulInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param); -int ElementOptMulReluInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param); -int ElementOptMulRelu6Int(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param); -int BroadcastMul(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size, - ArithmeticParameter *param); - -/* Add */ -int ElementAdd(const float *in0, const float *in1, float *out, int size); -int ElementAddRelu(const float *in0, const float *in1, float *out, int size); -int ElementAddRelu6(const float *in0, const float *in1, float *out, int size); -int ElementAddInt(const int *in0, const int *in1, int *out, int size); -int ElementOptAdd(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); -int ElementOptAddInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param); -int ElementOptAddRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); -int ElementOptAddRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); -int BroadcastAdd(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size, - ArithmeticParameter *param); - -/* Sub */ -int ElementSub(const float *in0, const float *in1, float *out, int size); -int ElementSubInt(const int *in0, const int *in1, int *out, int size); -int ElementSubRelu(const float *in0, const float *in1, float *out, int size); -int ElementSubRelu6(const float *in0, const float *in1, float *out, int size); -int ElementOptSub(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); -int ElementOptSubRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); -int ElementOptSubRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); -int ElementOptSubInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param); - -/* Div */ -int ElementDiv(const float *in0, const float *in1, float *out, int size); -int ElementDivRelu(const float *in0, const float *in1, float *out, int size); -int ElementDivRelu6(const float *in0, const float *in1, float *out, int size); -int ElementOptDiv(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); -int ElementOptDivRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); -int ElementOptDivRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); -int ElementOptDivInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param); -int BroadcastDiv(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size, - ArithmeticParameter *param); - /* logical and */ int ElementLogicalAnd(const float *in0, const float *in1, float *out, int size); int ElementLogicalAndInt(const int *in0, const int *in1, int *out, int size); @@ -88,9 +43,6 @@ int ElementLogicalAndBool(const bool *in0, const bool *in1, bool *out, int size) /* logical or */ int ElementLogicalOr(const float *in0, const float *in1, float *out, int size); -/* Element Squared Difference */ -int ElementSquaredDifference(const float *in0, const float *in1, float *out, int size); - /* max min */ int ElementMaximum(const float *in0, const float *in1, float *out, int size); int ElementMinimum(const float *in0, const float *in1, float *out, int size); diff --git a/mindspore/lite/nnacl/fp32/div_fp32.c b/mindspore/lite/nnacl/fp32/div_fp32.c new file mode 100644 index 0000000000..9d1f665ea1 --- /dev/null +++ b/mindspore/lite/nnacl/fp32/div_fp32.c @@ -0,0 +1,107 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "nnacl/fp32/div_fp32.h" +#include +#include "nnacl/fp32/arithmetic_fp32.h" + +int ElementOptDiv(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { + if (param->in_elements_num0_ == 1) { + for (int index = 0; index < size; index++) { + out[index] = in0[0] / in1[index]; + } + } else { + if (in1[0] == 0) { + return NNACL_ERRCODE_DIVISOR_ZERO; + } + for (int index = 0; index < size; index++) { + out[index] = in0[index] / in1[0]; + } + } + return NNACL_OK; +} + +int ElementOptDivRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { + if (param->in_elements_num0_ == 1) { + for (int index = 0; index < size; index++) { + out[index] = in0[0] / in1[index]; + out[index] = out[index] > 0 ? out[index] : 0; + } + } else { + for (int index = 0; index < size; index++) { + out[index] = in0[index] / in1[0]; + out[index] = out[index] > 0 ? out[index] : 0; + } + } + return NNACL_OK; +} + +int ElementOptDivRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { + if (param->in_elements_num0_ == 1) { + for (int index = 0; index < size; index++) { + out[index] = MSMIN(MSMAX(in0[0] / in1[index], 0), 6); + } + } else { + for (int index = 0; index < size; index++) { + out[index] = MSMIN(MSMAX(in0[index] / in1[0], 0), 6); + } + } + return NNACL_OK; +} + +int ElementOptDivInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) { + if (param->in_elements_num0_ == 1) { + for (int index = 0; index < size; index++) { + out[index] = in0[0] / in1[index]; + } + } else { + if (in1[0] == 0) { + return NNACL_ERRCODE_DIVISOR_ZERO; + } + for (int index = 0; index < size; index++) { + out[index] = in0[index] / in1[0]; + } + } + return NNACL_OK; +} + +int BroadcastDiv(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size, + ArithmeticParameter *param) { + TileDimensionsFp32(in0, in1, tile_in0, tile_in1, param); + return ElementDiv(tile_in0, tile_in1, out, size); +} + +int ElementDiv(const float *in0, const float *in1, float *out, int size) { + for (int i = 0; i < size; i++) { + out[i] = in0[i] / in1[i]; + } + return NNACL_OK; +} + +int ElementDivRelu(const float *in0, const float *in1, float *out, int size) { + for (int i = 0; i < size; i++) { + float res = in0[i] / in1[i]; + out[i] = res > 0 ? res : 0; + } + return NNACL_OK; +} + +int ElementDivRelu6(const float *in0, const float *in1, float *out, int size) { + for (int i = 0; i < size; i++) { + out[i] = MSMIN(MSMAX(in0[i] / in1[i], 0), 6); + } + return NNACL_OK; +} diff --git a/mindspore/lite/nnacl/fp32/div_fp32.h b/mindspore/lite/nnacl/fp32/div_fp32.h new file mode 100644 index 0000000000..755d678d55 --- /dev/null +++ b/mindspore/lite/nnacl/fp32/div_fp32.h @@ -0,0 +1,43 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_LITE_NNACL_FP32_DIV_H_ +#define MINDSPORE_LITE_NNACL_FP32_DIV_H_ + +#ifdef ENABLE_NEON +#include +#endif +#include "nnacl/op_base.h" +#include "nnacl/base/arithmetic_base.h" +#include "nnacl/errorcode.h" + +#ifdef __cplusplus +extern "C" { +#endif +int ElementDiv(const float *in0, const float *in1, float *out, int size); +int ElementDivRelu(const float *in0, const float *in1, float *out, int size); +int ElementDivRelu6(const float *in0, const float *in1, float *out, int size); +int ElementOptDiv(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); +int ElementOptDivRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); +int ElementOptDivRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); +int ElementOptDivInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param); +int BroadcastDiv(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size, + ArithmeticParameter *param); + +#ifdef __cplusplus +} +#endif + +#endif // MINDSPORE_LITE_NNACL_FP32_DIV_H_ diff --git a/mindspore/lite/nnacl/fp32/lstm_fp32.c b/mindspore/lite/nnacl/fp32/lstm_fp32.c index fd615201bd..7c88d8707e 100644 --- a/mindspore/lite/nnacl/fp32/lstm_fp32.c +++ b/mindspore/lite/nnacl/fp32/lstm_fp32.c @@ -19,6 +19,7 @@ #include #include "nnacl/fp32/activation_fp32.h" #include "nnacl/fp32/arithmetic_fp32.h" +#include "nnacl/fp32/mul_fp32.h" void InitGate(float *gate_buffer, const float *bias, const LstmParameter *lstm_parm) { int gate_offest = 0; diff --git a/mindspore/lite/nnacl/fp32/mul_fp32.c b/mindspore/lite/nnacl/fp32/mul_fp32.c new file mode 100644 index 0000000000..4fea298205 --- /dev/null +++ b/mindspore/lite/nnacl/fp32/mul_fp32.c @@ -0,0 +1,327 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "nnacl/fp32/mul_fp32.h" +#include "nnacl/fp32/arithmetic_fp32.h" + +int BroadcastMul(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size, + ArithmeticParameter *param) { + TileDimensionsFp32(in0, in1, tile_in0, tile_in1, param); + return ElementMul(tile_in0, tile_in1, out, size); +} + +int ElementMul(const float *in0, const float *in1, float *out, int size) { + int index = 0; +#ifdef ENABLE_NEON + for (; index <= size - 4; index += C4NUM) { + float32x4_t vin0 = vld1q_f32(in0 + index); + float32x4_t vin1 = vld1q_f32(in1 + index); + float32x4_t vout = vmulq_f32(vin0, vin1); + vst1q_f32(out + index, vout); + } +#endif + for (; index < size; index++) { + out[index] = in0[index] * in1[index]; + } + return NNACL_OK; +} + +int ElementMulRelu(const float *in0, const float *in1, float *out, int size) { + int index = 0; +#ifdef ENABLE_NEON + float32x4_t zeros = vdupq_n_f32(0.0f); + for (; index <= size - 4; index += C4NUM) { + float32x4_t vin0 = vld1q_f32(in0 + index); + float32x4_t vin1 = vld1q_f32(in1 + index); + float32x4_t vout = vmulq_f32(vin0, vin1); + vout = vbslq_f32(vcgtq_f32(vout, zeros), vout, zeros); + vst1q_f32(out + index, vout); + } +#endif + for (; index < size; index++) { + float res = in0[index] * in1[index]; + out[index] = res > 0 ? res : 0; + } + return NNACL_OK; +} + +int ElementMulRelu6(const float *in0, const float *in1, float *out, int size) { + int index = 0; +#ifdef ENABLE_NEON + float32x4_t zeros = vdupq_n_f32(0.0f); + float32x4_t bounds = vdupq_n_f32(6.0f); + for (; index <= size - 4; index += C4NUM) { + float32x4_t vin0 = vld1q_f32(in0 + index); + float32x4_t vin1 = vld1q_f32(in1 + index); + float32x4_t vout = vminq_f32(vmaxq_f32(vmulq_f32(vin0, vin1), zeros), bounds); + vst1q_f32(out + index, vout); + } +#endif + for (; index < size; index++) { + out[index] = MSMIN(MSMAX(in0[index] * in1[index], 0), 6); + } + return NNACL_OK; +} + +int ElementMulInt(const int *in0, const int *in1, int *out, int size) { + int index = 0; +#ifdef ENABLE_NEON + for (; index <= size - 4; index += C4NUM) { + int32x4_t vin0 = vld1q_s32(in0 + index); + int32x4_t vin1 = vld1q_s32(in1 + index); + int32x4_t vout = vmulq_s32(vin0, vin1); + vst1q_s32(out + index, vout); + } +#endif + for (; index < size; index++) { + out[index] = in0[index] * in1[index]; + } + return NNACL_OK; +} + +int ElementMulReluInt(const int *in0, const int *in1, int *out, int size) { + int index = 0; +#ifdef ENABLE_NEON + int32x4_t zeros = vdupq_n_s32(0); + for (; index <= size - 4; index += C4NUM) { + int32x4_t vin0 = vld1q_s32(in0 + index); + int32x4_t vin1 = vld1q_s32(in1 + index); + int32x4_t vout = vmulq_s32(vin0, vin1); + vout = vbslq_s32(vcgtq_s32(vout, zeros), vout, zeros); + vst1q_s32(out + index, vout); + } +#endif + for (; index < size; index++) { + float res = in0[index] * in1[index]; + out[index] = res > 0 ? res : 0; + } + return NNACL_OK; +} + +int ElementMulRelu6Int(const int *in0, const int *in1, int *out, int size) { + int index = 0; +#ifdef ENABLE_NEON + int32x4_t zeros = vdupq_n_s32(0); + int32x4_t bounds = vdupq_n_s32(6); + for (; index <= size - 4; index += C4NUM) { + int32x4_t vin0 = vld1q_s32(in0 + index); + int32x4_t vin1 = vld1q_s32(in1 + index); + int32x4_t vout = vminq_s32(vmaxq_s32(vmulq_s32(vin0, vin1), zeros), bounds); + vst1q_s32(out + index, vout); + } +#endif + for (; index < size; index++) { + out[index] = MSMIN(MSMAX(in0[index] * in1[index], 0), 6); + } + return NNACL_OK; +} + +int ElementOptMul(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { +#ifdef ENABLE_NEON + float32x4_t vin0_opt = vdupq_n_f32(in0[0]); + float32x4_t vin1_opt = vdupq_n_f32(in1[0]); +#endif + int index = 0; + if (param->in_elements_num0_ == 1) { +#ifdef ENABLE_NEON + for (; index <= size - 4; index += C4NUM) { + float32x4_t vin1 = vld1q_f32(in1 + index); + float32x4_t vout = vmulq_f32(vin0_opt, vin1); + vst1q_f32(out + index, vout); + } +#endif + for (; index < size; index++) { + out[index] = in0[0] * in1[index]; + } + } else { +#ifdef ENABLE_NEON + for (; index <= size - 4; index += C4NUM) { + float32x4_t vin0 = vld1q_f32(in0 + index); + float32x4_t vout = vmulq_f32(vin0, vin1_opt); + vst1q_f32(out + index, vout); + } +#endif + for (; index < size; index++) { + out[index] = in0[index] * in1[0]; + } + } + return NNACL_OK; +} + +int ElementOptMulRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { +#ifdef ENABLE_NEON + float32x4_t vin0_opt = vdupq_n_f32(in0[0]); + float32x4_t vin1_opt = vdupq_n_f32(in1[0]); + float32x4_t zeros = vdupq_n_f32(0.0f); +#endif + int index = 0; + if (param->in_elements_num0_ == 1) { +#ifdef ENABLE_NEON + for (; index <= size - 4; index += C4NUM) { + float32x4_t vin1 = vld1q_f32(in1 + index); + float32x4_t vout = vmaxq_f32(vmulq_f32(vin0_opt, vin1), zeros); + vst1q_f32(out + index, vout); + } +#endif + for (; index < size; index++) { + out[index] = MSMAX(in0[0] * in1[index], 0); + } + } else { +#ifdef ENABLE_NEON + for (; index <= size - 4; index += C4NUM) { + float32x4_t vin0 = vld1q_f32(in0 + index); + float32x4_t vout = vmaxq_f32(vmulq_f32(vin0, vin1_opt), zeros); + vst1q_f32(out + index, vout); + } +#endif + for (; index < size; index++) { + out[index] = MSMAX(in0[index] * in1[0], 0); + } + } + return NNACL_OK; +} + +int ElementOptMulRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { +#ifdef ENABLE_NEON + float32x4_t vin0_opt = vdupq_n_f32(in0[0]); + float32x4_t vin1_opt = vdupq_n_f32(in1[0]); + float32x4_t zeros = vdupq_n_f32(0.0f); + float32x4_t bounds = vdupq_n_f32(6.0f); +#endif + int index = 0; + if (param->in_elements_num0_ == 1) { +#ifdef ENABLE_NEON + for (; index <= size - 4; index += C4NUM) { + float32x4_t vin1 = vld1q_f32(in1 + index); + float32x4_t vout = vminq_f32(vmaxq_f32(vmulq_f32(vin0_opt, vin1), zeros), bounds); + vst1q_f32(out + index, vout); + } +#endif + for (; index < size; index++) { + out[index] = MSMIN(MSMAX(in0[0] * in1[index], 0), 6); + } + } else { +#ifdef ENABLE_NEON + for (; index <= size - 4; index += C4NUM) { + float32x4_t vin0 = vld1q_f32(in0 + index); + float32x4_t vout = vminq_f32(vmaxq_f32(vmulq_f32(vin0, vin1_opt), zeros), bounds); + vst1q_f32(out + index, vout); + } +#endif + for (; index < size; index++) { + out[index] = MSMIN(MSMAX(in0[index] * in1[0], 0), 6); + } + } + return NNACL_OK; +} + +int ElementOptMulInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) { +#ifdef ENABLE_NEON + int32x4_t vin0_opt = vdupq_n_s32(in0[0]); + int32x4_t vin1_opt = vdupq_n_s32(in1[0]); +#endif + int index = 0; + if (param->in_elements_num0_ == 1) { +#ifdef ENABLE_NEON + for (; index <= size - 4; index += C4NUM) { + int32x4_t vin1 = vld1q_s32(in1 + index); + int32x4_t vout = vmulq_s32(vin0_opt, vin1); + vst1q_s32(out + index, vout); + } +#endif + for (; index < size; index++) { + out[index] = in0[0] * in1[index]; + } + } else { +#ifdef ENABLE_NEON + for (; index <= size - 4; index += C4NUM) { + int32x4_t vin0 = vld1q_s32(in0 + index); + int32x4_t vout = vmulq_s32(vin0, vin1_opt); + vst1q_s32(out + index, vout); + } +#endif + for (; index < size; index++) { + out[index] = in0[index] * in1[0]; + } + } + return NNACL_OK; +} + +int ElementOptMulReluInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) { +#ifdef ENABLE_NEON + int32x4_t vin0_opt = vdupq_n_s32(in0[0]); + int32x4_t vin1_opt = vdupq_n_s32(in1[0]); + int32x4_t zeros = vdupq_n_s32(0); +#endif + int index = 0; + if (param->in_elements_num0_ == 1) { +#ifdef ENABLE_NEON + for (; index <= size - 4; index += C4NUM) { + int32x4_t vin1 = vld1q_s32(in1 + index); + int32x4_t vout = vmaxq_s32(vmulq_s32(vin0_opt, vin1), zeros); + vst1q_s32(out + index, vout); + } +#endif + for (; index < size; index++) { + out[index] = MSMAX(in0[0] * in1[index], 0); + } + } else { +#ifdef ENABLE_NEON + for (; index <= size - 4; index += C4NUM) { + int32x4_t vin0 = vld1q_s32(in0 + index); + int32x4_t vout = vmaxq_s32(vmulq_s32(vin0, vin1_opt), zeros); + vst1q_s32(out + index, vout); + } +#endif + for (; index < size; index++) { + out[index] = MSMAX(in0[index] * in1[0], 0); + } + } + return NNACL_OK; +} + +int ElementOptMulRelu6Int(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) { +#ifdef ENABLE_NEON + int32x4_t vin0_opt = vdupq_n_s32(in0[0]); + int32x4_t vin1_opt = vdupq_n_s32(in1[0]); + int32x4_t zeros = vdupq_n_s32(0); + int32x4_t bounds = vdupq_n_s32(6); +#endif + int index = 0; + if (param->in_elements_num0_ == 1) { +#ifdef ENABLE_NEON + for (; index <= size - 4; index += C4NUM) { + int32x4_t vin1 = vld1q_s32(in1 + index); + int32x4_t vout = vminq_s32(vmaxq_s32(vmulq_s32(vin0_opt, vin1), zeros), bounds); + vst1q_s32(out + index, vout); + } +#endif + for (; index < size; index++) { + out[index] = MSMIN(MSMAX(in0[0] * in1[index], 0), 6); + } + } else { +#ifdef ENABLE_NEON + for (; index <= size - 4; index += C4NUM) { + int32x4_t vin0 = vld1q_s32(in0 + index); + int32x4_t vout = vminq_s32(vmaxq_s32(vmulq_s32(vin0, vin1_opt), zeros), bounds); + vst1q_s32(out + index, vout); + } +#endif + for (; index < size; index++) { + out[index] = MSMIN(MSMAX(in0[index] * in1[0], 0), 6); + } + } + return NNACL_OK; +} diff --git a/mindspore/lite/nnacl/fp32/mul_fp32.h b/mindspore/lite/nnacl/fp32/mul_fp32.h new file mode 100644 index 0000000000..0ff54a43c8 --- /dev/null +++ b/mindspore/lite/nnacl/fp32/mul_fp32.h @@ -0,0 +1,49 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_LITE_NNACL_FP32_MUL_H_ +#define MINDSPORE_LITE_NNACL_FP32_MUL_H_ + +#ifdef ENABLE_NEON +#include +#endif +#include "nnacl/op_base.h" +#include "nnacl/base/arithmetic_base.h" +#include "nnacl/errorcode.h" + +#ifdef __cplusplus +extern "C" { +#endif + +int ElementMul(const float *in0, const float *in1, float *out, int size); +int ElementMulRelu(const float *in0, const float *in1, float *out, int size); +int ElementMulRelu6(const float *in0, const float *in1, float *out, int size); +int ElementMulInt(const int *in0, const int *in1, int *out, int size); +int ElementMulReluInt(const int *in0, const int *in1, int *out, int size); +int ElementMulRelu6Int(const int *in0, const int *in1, int *out, int size); +int ElementOptMul(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); +int ElementOptMulRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); +int ElementOptMulRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); +int ElementOptMulInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param); +int ElementOptMulReluInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param); +int ElementOptMulRelu6Int(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param); +int BroadcastMul(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size, + ArithmeticParameter *param); + +#ifdef __cplusplus +} +#endif + +#endif // MINDSPORE_LITE_NNACL_FP32_MUL_H_ diff --git a/mindspore/lite/nnacl/fp32/squared_difference.c b/mindspore/lite/nnacl/fp32/squared_difference.c new file mode 100644 index 0000000000..0340329009 --- /dev/null +++ b/mindspore/lite/nnacl/fp32/squared_difference.c @@ -0,0 +1,28 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_LITE_NNACL_SQUARED_DIFFERENCE_H_ +#define MINDSPORE_LITE_NNACL_SQUARED_DIFFERENCE_H_ + +#include "nnacl/fp32/squared_difference.h" +#include "nnacl/fp32/sub_fp32.h" +#include "nnacl/fp32/mul_fp32.h" + +int ElementSquaredDifference(const float *in0, const float *in1, float *out, int size) { + ElementSub(in0, in1, out, size); + return ElementMul(out, out, out, size); +} + +#endif // MINDSPORE_LITE_NNACL_SQUARED_DIFFERENCE_H_ diff --git a/mindspore/lite/nnacl/fp32/squared_difference.h b/mindspore/lite/nnacl/fp32/squared_difference.h new file mode 100644 index 0000000000..71098b5525 --- /dev/null +++ b/mindspore/lite/nnacl/fp32/squared_difference.h @@ -0,0 +1,37 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_LITE_NNACL_SQUARED_DIFFERENCE_H_ +#define MINDSPORE_LITE_NNACL_SQUARED_DIFFERENCE_H_ + +#ifdef ENABLE_NEON +#include +#endif +#include "nnacl/op_base.h" +#include "nnacl/base/arithmetic_base.h" +#include "nnacl/errorcode.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* Element Squared Difference */ +int ElementSquaredDifference(const float *in0, const float *in1, float *out, int size); + +#ifdef __cplusplus +} +#endif + +#endif // MINDSPORE_LITE_NNACL_SQUARED_DIFFERENCE_H_ diff --git a/mindspore/lite/nnacl/fp32/sub_fp32.c b/mindspore/lite/nnacl/fp32/sub_fp32.c new file mode 100644 index 0000000000..e2dfcb7fdb --- /dev/null +++ b/mindspore/lite/nnacl/fp32/sub_fp32.c @@ -0,0 +1,217 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "nnacl/fp32/sub_fp32.h" + +int ElementOptSub(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { +#ifdef ENABLE_NEON + float32x4_t vin0_opt = vdupq_n_f32(in0[0]); + float32x4_t vin1_opt = vdupq_n_f32(in1[0]); +#endif + int index = 0; + if (param->in_elements_num0_ == 1) { +#ifdef ENABLE_NEON + for (; index <= size - 4; index += C4NUM) { + float32x4_t vin1 = vld1q_f32(in1 + index); + float32x4_t vout = vsubq_f32(vin0_opt, vin1); + vst1q_f32(out + index, vout); + } +#endif + for (; index < size; index++) { + out[index] = in0[0] - in1[index]; + } + } else { +#ifdef ENABLE_NEON + for (; index <= size - 4; index += C4NUM) { + float32x4_t vin0 = vld1q_f32(in0 + index); + float32x4_t vout = vsubq_f32(vin0, vin1_opt); + vst1q_f32(out + index, vout); + } +#endif + for (; index < size; index++) { + out[index] = in0[index] - in1[0]; + } + } + return NNACL_OK; +} + +int ElementOptSubInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) { +#ifdef ENABLE_NEON + int32x4_t vin0_opt = vdupq_n_s32(in0[0]); + int32x4_t vin1_opt = vdupq_n_s32(in1[0]); +#endif + int index = 0; + if (param->in_elements_num0_ == 1) { +#ifdef ENABLE_NEON + for (; index <= size - 4; index += C4NUM) { + int32x4_t vin1 = vld1q_s32(in1 + index); + int32x4_t vout = vsubq_s32(vin0_opt, vin1); + vst1q_s32(out + index, vout); + } +#endif + for (; index < size; index++) { + out[index] = in0[0] - in1[index]; + } + } else { +#ifdef ENABLE_NEON + for (; index <= size - 4; index += C4NUM) { + int32x4_t vin0 = vld1q_s32(in0 + index); + int32x4_t vout = vsubq_s32(vin0, vin1_opt); + vst1q_s32(out + index, vout); + } +#endif + for (; index < size; index++) { + out[index] = in0[index] - in1[0]; + } + } + return NNACL_OK; +} + +int ElementOptSubRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { +#ifdef ENABLE_NEON + float32x4_t vin0_opt = vdupq_n_f32(in0[0]); + float32x4_t vin1_opt = vdupq_n_f32(in1[0]); + float32x4_t zeros = vdupq_n_f32(0.0f); +#endif + int index = 0; + if (param->in_elements_num0_ == 1) { +#ifdef ENABLE_NEON + for (; index <= size - 4; index += C4NUM) { + float32x4_t vin1 = vld1q_f32(in1 + index); + float32x4_t vout = vmaxq_f32(vsubq_f32(vin0_opt, vin1), zeros); + vst1q_f32(out + index, vout); + } +#endif + for (; index < size; index++) { + out[index] = MSMAX(in0[0] - in1[index], 0); + } + } else { +#ifdef ENABLE_NEON + for (; index <= size - 4; index += C4NUM) { + float32x4_t vin0 = vld1q_f32(in0 + index); + float32x4_t vout = vmaxq_f32(vsubq_f32(vin0, vin1_opt), zeros); + vst1q_f32(out + index, vout); + } +#endif + for (; index < size; index++) { + out[index] = MSMAX(in0[index] - in1[0], 0); + } + } + return NNACL_OK; +} + +int ElementOptSubRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { +#ifdef ENABLE_NEON + float32x4_t vin0_opt = vdupq_n_f32(in0[0]); + float32x4_t vin1_opt = vdupq_n_f32(in1[0]); + float32x4_t zeros = vdupq_n_f32(0.0f); + float32x4_t bounds = vdupq_n_f32(6.0f); +#endif + int index = 0; + if (param->in_elements_num0_ == 1) { +#ifdef ENABLE_NEON + for (; index <= size - 4; index += C4NUM) { + float32x4_t vin1 = vld1q_f32(in1 + index); + float32x4_t vout = vminq_f32(vmaxq_f32(vsubq_f32(vin0_opt, vin1), zeros), bounds); + vst1q_f32(out + index, vout); + } +#endif + for (; index < size; index++) { + out[index] = MSMIN(MSMAX(in0[0] - in1[index], 0), 6); + } + } else { +#ifdef ENABLE_NEON + for (; index <= size - 4; index += C4NUM) { + float32x4_t vin0 = vld1q_f32(in0 + index); + float32x4_t vout = vminq_f32(vmaxq_f32(vsubq_f32(vin0, vin1_opt), zeros), bounds); + vst1q_f32(out + index, vout); + } +#endif + for (; index < size; index++) { + out[index] = MSMIN(MSMAX(in0[index] - in1[0], 0), 6); + } + } + return NNACL_OK; +} + +int ElementSub(const float *in0, const float *in1, float *out, int size) { + int index = 0; +#ifdef ENABLE_NEON + for (; index <= size - 4; index += C4NUM) { + float32x4_t vin0 = vld1q_f32(in0 + index); + float32x4_t vin1 = vld1q_f32(in1 + index); + float32x4_t vout = vsubq_f32(vin0, vin1); + vst1q_f32(out + index, vout); + } +#endif + for (; index < size; index++) { + out[index] = in0[index] - in1[index]; + } + return NNACL_OK; +} + +int ElementSubInt(const int *in0, const int *in1, int *out, int size) { + int index = 0; +#ifdef ENABLE_NEON + for (; index <= size - 4; index += C4NUM) { + int32x4_t vin0 = vld1q_s32(in0 + index); + int32x4_t vin1 = vld1q_s32(in1 + index); + int32x4_t vout = vsubq_s32(vin0, vin1); + vst1q_s32(out + index, vout); + } +#endif + for (; index < size; index++) { + out[index] = in0[index] - in1[index]; + } + return NNACL_OK; +} + +int ElementSubRelu(const float *in0, const float *in1, float *out, int size) { + int index = 0; +#ifdef ENABLE_NEON + float32x4_t zeros = vdupq_n_f32(0.0f); + for (; index <= size - 4; index += C4NUM) { + float32x4_t vin0 = vld1q_f32(in0 + index); + float32x4_t vin1 = vld1q_f32(in1 + index); + float32x4_t vout = vsubq_f32(vin0, vin1); + vout = vbslq_f32(vcgtq_f32(vout, zeros), vout, zeros); + vst1q_f32(out + index, vout); + } +#endif + for (; index < size; index++) { + float res = in0[index] - in1[index]; + out[index] = res > 0 ? res : 0; + } + return NNACL_OK; +} + +int ElementSubRelu6(const float *in0, const float *in1, float *out, int size) { + int index = 0; +#ifdef ENABLE_NEON + float32x4_t zeros = vdupq_n_f32(0.0f); + float32x4_t bounds = vdupq_n_f32(6.0f); + for (; index <= size - 4; index += C4NUM) { + float32x4_t vin0 = vld1q_f32(in0 + index); + float32x4_t vin1 = vld1q_f32(in1 + index); + float32x4_t vout = vminq_f32(vmaxq_f32(vsubq_f32(vin0, vin1), zeros), bounds); + vst1q_f32(out + index, vout); + } +#endif + for (; index < size; index++) { + out[index] = MSMIN(MSMAX(in0[index] - in1[index], 0), 6); + } + + return NNACL_OK; +} diff --git a/mindspore/lite/nnacl/fp32/sub_fp32.h b/mindspore/lite/nnacl/fp32/sub_fp32.h new file mode 100644 index 0000000000..b846417190 --- /dev/null +++ b/mindspore/lite/nnacl/fp32/sub_fp32.h @@ -0,0 +1,43 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_LITE_NNACL_SUB_FP32_H_ +#define MINDSPORE_LITE_NNACL_SUB_FP32_H_ + +#ifdef ENABLE_NEON +#include +#endif +#include "nnacl/op_base.h" +#include "nnacl/base/arithmetic_base.h" +#include "nnacl/errorcode.h" + +#ifdef __cplusplus +extern "C" { +#endif + +int ElementSub(const float *in0, const float *in1, float *out, int size); +int ElementSubInt(const int *in0, const int *in1, int *out, int size); +int ElementSubRelu(const float *in0, const float *in1, float *out, int size); +int ElementSubRelu6(const float *in0, const float *in1, float *out, int size); +int ElementOptSub(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); +int ElementOptSubRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); +int ElementOptSubRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); +int ElementOptSubInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param); + +#ifdef __cplusplus +} +#endif + +#endif // MINDSPORE_LITE_NNACL_SUB_FP32_H_