| @@ -0,0 +1,225 @@ | |||||
| /** | |||||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #include "nnacl/fp32/add_fp32.h" | |||||
| #include "nnacl/fp32/arithmetic_fp32.h" | |||||
| int ElementOptAdd(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { | |||||
| #ifdef ENABLE_NEON | |||||
| float32x4_t vin0_opt = vdupq_n_f32(in0[0]); | |||||
| float32x4_t vin1_opt = vdupq_n_f32(in1[0]); | |||||
| #endif | |||||
| int index = 0; | |||||
| if (param->in_elements_num0_ == 1) { | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| float32x4_t vin1 = vld1q_f32(in1 + index); | |||||
| float32x4_t vout = vaddq_f32(vin0_opt, vin1); | |||||
| vst1q_f32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = in0[0] + in1[index]; | |||||
| } | |||||
| } else { | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| float32x4_t vin0 = vld1q_f32(in0 + index); | |||||
| float32x4_t vout = vaddq_f32(vin0, vin1_opt); | |||||
| vst1q_f32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = in0[index] + in1[0]; | |||||
| } | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementOptAddInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) { | |||||
| #ifdef ENABLE_NEON | |||||
| int32x4_t vin0_opt = vdupq_n_s32(in0[0]); | |||||
| int32x4_t vin1_opt = vdupq_n_s32(in1[0]); | |||||
| #endif | |||||
| int index = 0; | |||||
| if (param->in_elements_num0_ == 1) { | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| int32x4_t vin1 = vld1q_s32(in1 + index); | |||||
| int32x4_t vout = vaddq_s32(vin0_opt, vin1); | |||||
| vst1q_s32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = in0[0] + in1[index]; | |||||
| } | |||||
| } else { | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| int32x4_t vin0 = vld1q_s32(in0 + index); | |||||
| int32x4_t vout = vaddq_s32(vin0, vin1_opt); | |||||
| vst1q_s32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = in0[index] + in1[0]; | |||||
| } | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementOptAddRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { | |||||
| #ifdef ENABLE_NEON | |||||
| float32x4_t vin0_opt = vdupq_n_f32(in0[0]); | |||||
| float32x4_t vin1_opt = vdupq_n_f32(in1[0]); | |||||
| float32x4_t zeros = vdupq_n_f32(0.0f); | |||||
| #endif | |||||
| int index = 0; | |||||
| if (param->in_elements_num0_ == 1) { | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| float32x4_t vin1 = vld1q_f32(in1 + index); | |||||
| float32x4_t vout = vmaxq_f32(vaddq_f32(vin0_opt, vin1), zeros); | |||||
| vst1q_f32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = MSMAX(in0[0] + in1[index], 0); | |||||
| } | |||||
| } else { | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| float32x4_t vin0 = vld1q_f32(in0 + index); | |||||
| float32x4_t vout = vmaxq_f32(vaddq_f32(vin0, vin1_opt), zeros); | |||||
| vst1q_f32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = MSMAX(in0[index] + in1[0], 0); | |||||
| } | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementOptAddRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { | |||||
| #ifdef ENABLE_NEON | |||||
| float32x4_t vin0_opt = vdupq_n_f32(in0[0]); | |||||
| float32x4_t vin1_opt = vdupq_n_f32(in1[0]); | |||||
| float32x4_t zeros = vdupq_n_f32(0.0f); | |||||
| float32x4_t bounds = vdupq_n_f32(6.0f); | |||||
| #endif | |||||
| int index = 0; | |||||
| if (param->in_elements_num0_ == 1) { | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| float32x4_t vin1 = vld1q_f32(in1 + index); | |||||
| float32x4_t vout = vminq_f32(vmaxq_f32(vaddq_f32(vin0_opt, vin1), zeros), bounds); | |||||
| vst1q_f32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = MSMIN(MSMAX(in0[0] + in1[index], 0), 6); | |||||
| } | |||||
| } else { | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| float32x4_t vin0 = vld1q_f32(in0 + index); | |||||
| float32x4_t vout = vminq_f32(vmaxq_f32(vaddq_f32(vin0, vin1_opt), zeros), bounds); | |||||
| vst1q_f32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = MSMIN(MSMAX(in0[index] + in1[0], 0), 6); | |||||
| } | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int BroadcastAdd(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size, | |||||
| ArithmeticParameter *param) { | |||||
| TileDimensionsFp32(in0, in1, tile_in0, tile_in1, param); | |||||
| return ElementAdd(tile_in0, tile_in1, out, size); | |||||
| } | |||||
| int ElementAdd(const float *in0, const float *in1, float *out, int size) { | |||||
| int index = 0; | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| float32x4_t vin0 = vld1q_f32(in0 + index); | |||||
| float32x4_t vin1 = vld1q_f32(in1 + index); | |||||
| float32x4_t vout = vaddq_f32(vin0, vin1); | |||||
| vst1q_f32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = in0[index] + in1[index]; | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementAddRelu(const float *in0, const float *in1, float *out, int size) { | |||||
| int index = 0; | |||||
| #ifdef ENABLE_NEON | |||||
| float32x4_t zeros = vdupq_n_f32(0.0f); | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| float32x4_t vin0 = vld1q_f32(in0 + index); | |||||
| float32x4_t vin1 = vld1q_f32(in1 + index); | |||||
| float32x4_t vout = vaddq_f32(vin0, vin1); | |||||
| vout = vbslq_f32(vcgtq_f32(vout, zeros), vout, zeros); | |||||
| vst1q_f32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| float res = in0[index] + in1[index]; | |||||
| out[index] = res > 0 ? res : 0; | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementAddRelu6(const float *in0, const float *in1, float *out, int size) { | |||||
| int index = 0; | |||||
| #ifdef ENABLE_NEON | |||||
| float32x4_t zeros = vdupq_n_f32(0.0f); | |||||
| float32x4_t bounds = vdupq_n_f32(6.0f); | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| float32x4_t vin0 = vld1q_f32(in0 + index); | |||||
| float32x4_t vin1 = vld1q_f32(in1 + index); | |||||
| float32x4_t vout = vminq_f32(vmaxq_f32(vaddq_f32(vin0, vin1), zeros), bounds); | |||||
| vst1q_f32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = MSMIN(MSMAX(in0[index] + in1[index], 0), 6); | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementAddInt(const int *in0, const int *in1, int *out, int size) { | |||||
| int index = 0; | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| int32x4_t vin0 = vld1q_s32(in0 + index); | |||||
| int32x4_t vin1 = vld1q_s32(in1 + index); | |||||
| int32x4_t vout = vaddq_s32(vin0, vin1); | |||||
| vst1q_s32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = in0[index] + in1[index]; | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| @@ -0,0 +1,45 @@ | |||||
| /** | |||||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #ifndef MINDSPORE_LITE_NNACL_FP32_ADD_H_ | |||||
| #define MINDSPORE_LITE_NNACL_FP32_ADD_H_ | |||||
| #ifdef ENABLE_NEON | |||||
| #include <arm_neon.h> | |||||
| #endif | |||||
| #include "nnacl/op_base.h" | |||||
| #include "nnacl/base/arithmetic_base.h" | |||||
| #include "nnacl/errorcode.h" | |||||
| #ifdef __cplusplus | |||||
| extern "C" { | |||||
| #endif | |||||
| int ElementAdd(const float *in0, const float *in1, float *out, int size); | |||||
| int ElementAddRelu(const float *in0, const float *in1, float *out, int size); | |||||
| int ElementAddRelu6(const float *in0, const float *in1, float *out, int size); | |||||
| int ElementAddInt(const int *in0, const int *in1, int *out, int size); | |||||
| int ElementOptAdd(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); | |||||
| int ElementOptAddInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param); | |||||
| int ElementOptAddRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); | |||||
| int ElementOptAddRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); | |||||
| int BroadcastAdd(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size, | |||||
| ArithmeticParameter *param); | |||||
| #ifdef __cplusplus | |||||
| } | |||||
| #endif | |||||
| #endif // MINDSPORE_LITE_NNACL_FP32_ADD_H_ | |||||
| @@ -19,812 +19,6 @@ | |||||
| #define ACCURACY_DATA 0.00000001 | #define ACCURACY_DATA 0.00000001 | ||||
| int ElementOptMul(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { | |||||
| #ifdef ENABLE_NEON | |||||
| float32x4_t vin0_opt = vdupq_n_f32(in0[0]); | |||||
| float32x4_t vin1_opt = vdupq_n_f32(in1[0]); | |||||
| #endif | |||||
| int index = 0; | |||||
| if (param->in_elements_num0_ == 1) { | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| float32x4_t vin1 = vld1q_f32(in1 + index); | |||||
| float32x4_t vout = vmulq_f32(vin0_opt, vin1); | |||||
| vst1q_f32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = in0[0] * in1[index]; | |||||
| } | |||||
| } else { | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| float32x4_t vin0 = vld1q_f32(in0 + index); | |||||
| float32x4_t vout = vmulq_f32(vin0, vin1_opt); | |||||
| vst1q_f32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = in0[index] * in1[0]; | |||||
| } | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementOptMulRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { | |||||
| #ifdef ENABLE_NEON | |||||
| float32x4_t vin0_opt = vdupq_n_f32(in0[0]); | |||||
| float32x4_t vin1_opt = vdupq_n_f32(in1[0]); | |||||
| float32x4_t zeros = vdupq_n_f32(0.0f); | |||||
| #endif | |||||
| int index = 0; | |||||
| if (param->in_elements_num0_ == 1) { | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| float32x4_t vin1 = vld1q_f32(in1 + index); | |||||
| float32x4_t vout = vmaxq_f32(vmulq_f32(vin0_opt, vin1), zeros); | |||||
| vst1q_f32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = MSMAX(in0[0] * in1[index], 0); | |||||
| } | |||||
| } else { | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| float32x4_t vin0 = vld1q_f32(in0 + index); | |||||
| float32x4_t vout = vmaxq_f32(vmulq_f32(vin0, vin1_opt), zeros); | |||||
| vst1q_f32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = MSMAX(in0[index] * in1[0], 0); | |||||
| } | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementOptMulRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { | |||||
| #ifdef ENABLE_NEON | |||||
| float32x4_t vin0_opt = vdupq_n_f32(in0[0]); | |||||
| float32x4_t vin1_opt = vdupq_n_f32(in1[0]); | |||||
| float32x4_t zeros = vdupq_n_f32(0.0f); | |||||
| float32x4_t bounds = vdupq_n_f32(6.0f); | |||||
| #endif | |||||
| int index = 0; | |||||
| if (param->in_elements_num0_ == 1) { | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| float32x4_t vin1 = vld1q_f32(in1 + index); | |||||
| float32x4_t vout = vminq_f32(vmaxq_f32(vmulq_f32(vin0_opt, vin1), zeros), bounds); | |||||
| vst1q_f32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = MSMIN(MSMAX(in0[0] * in1[index], 0), 6); | |||||
| } | |||||
| } else { | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| float32x4_t vin0 = vld1q_f32(in0 + index); | |||||
| float32x4_t vout = vminq_f32(vmaxq_f32(vmulq_f32(vin0, vin1_opt), zeros), bounds); | |||||
| vst1q_f32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = MSMIN(MSMAX(in0[index] * in1[0], 0), 6); | |||||
| } | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementOptMulInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) { | |||||
| #ifdef ENABLE_NEON | |||||
| int32x4_t vin0_opt = vdupq_n_s32(in0[0]); | |||||
| int32x4_t vin1_opt = vdupq_n_s32(in1[0]); | |||||
| #endif | |||||
| int index = 0; | |||||
| if (param->in_elements_num0_ == 1) { | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| int32x4_t vin1 = vld1q_s32(in1 + index); | |||||
| int32x4_t vout = vmulq_s32(vin0_opt, vin1); | |||||
| vst1q_s32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = in0[0] * in1[index]; | |||||
| } | |||||
| } else { | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| int32x4_t vin0 = vld1q_s32(in0 + index); | |||||
| int32x4_t vout = vmulq_s32(vin0, vin1_opt); | |||||
| vst1q_s32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = in0[index] * in1[0]; | |||||
| } | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementOptMulReluInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) { | |||||
| #ifdef ENABLE_NEON | |||||
| int32x4_t vin0_opt = vdupq_n_s32(in0[0]); | |||||
| int32x4_t vin1_opt = vdupq_n_s32(in1[0]); | |||||
| int32x4_t zeros = vdupq_n_s32(0); | |||||
| #endif | |||||
| int index = 0; | |||||
| if (param->in_elements_num0_ == 1) { | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| int32x4_t vin1 = vld1q_s32(in1 + index); | |||||
| int32x4_t vout = vmaxq_s32(vmulq_s32(vin0_opt, vin1), zeros); | |||||
| vst1q_s32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = MSMAX(in0[0] * in1[index], 0); | |||||
| } | |||||
| } else { | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| int32x4_t vin0 = vld1q_s32(in0 + index); | |||||
| int32x4_t vout = vmaxq_s32(vmulq_s32(vin0, vin1_opt), zeros); | |||||
| vst1q_s32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = MSMAX(in0[index] * in1[0], 0); | |||||
| } | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementOptMulRelu6Int(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) { | |||||
| #ifdef ENABLE_NEON | |||||
| int32x4_t vin0_opt = vdupq_n_s32(in0[0]); | |||||
| int32x4_t vin1_opt = vdupq_n_s32(in1[0]); | |||||
| int32x4_t zeros = vdupq_n_s32(0); | |||||
| int32x4_t bounds = vdupq_n_s32(6); | |||||
| #endif | |||||
| int index = 0; | |||||
| if (param->in_elements_num0_ == 1) { | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| int32x4_t vin1 = vld1q_s32(in1 + index); | |||||
| int32x4_t vout = vminq_s32(vmaxq_s32(vmulq_s32(vin0_opt, vin1), zeros), bounds); | |||||
| vst1q_s32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = MSMIN(MSMAX(in0[0] * in1[index], 0), 6); | |||||
| } | |||||
| } else { | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| int32x4_t vin0 = vld1q_s32(in0 + index); | |||||
| int32x4_t vout = vminq_s32(vmaxq_s32(vmulq_s32(vin0, vin1_opt), zeros), bounds); | |||||
| vst1q_s32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = MSMIN(MSMAX(in0[index] * in1[0], 0), 6); | |||||
| } | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementOptSub(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { | |||||
| #ifdef ENABLE_NEON | |||||
| float32x4_t vin0_opt = vdupq_n_f32(in0[0]); | |||||
| float32x4_t vin1_opt = vdupq_n_f32(in1[0]); | |||||
| #endif | |||||
| int index = 0; | |||||
| if (param->in_elements_num0_ == 1) { | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| float32x4_t vin1 = vld1q_f32(in1 + index); | |||||
| float32x4_t vout = vsubq_f32(vin0_opt, vin1); | |||||
| vst1q_f32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = in0[0] - in1[index]; | |||||
| } | |||||
| } else { | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| float32x4_t vin0 = vld1q_f32(in0 + index); | |||||
| float32x4_t vout = vsubq_f32(vin0, vin1_opt); | |||||
| vst1q_f32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = in0[index] - in1[0]; | |||||
| } | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementOptSubInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) { | |||||
| #ifdef ENABLE_NEON | |||||
| int32x4_t vin0_opt = vdupq_n_s32(in0[0]); | |||||
| int32x4_t vin1_opt = vdupq_n_s32(in1[0]); | |||||
| #endif | |||||
| int index = 0; | |||||
| if (param->in_elements_num0_ == 1) { | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| int32x4_t vin1 = vld1q_s32(in1 + index); | |||||
| int32x4_t vout = vsubq_s32(vin0_opt, vin1); | |||||
| vst1q_s32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = in0[0] - in1[index]; | |||||
| } | |||||
| } else { | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| int32x4_t vin0 = vld1q_s32(in0 + index); | |||||
| int32x4_t vout = vsubq_s32(vin0, vin1_opt); | |||||
| vst1q_s32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = in0[index] - in1[0]; | |||||
| } | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementOptSubRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { | |||||
| #ifdef ENABLE_NEON | |||||
| float32x4_t vin0_opt = vdupq_n_f32(in0[0]); | |||||
| float32x4_t vin1_opt = vdupq_n_f32(in1[0]); | |||||
| float32x4_t zeros = vdupq_n_f32(0.0f); | |||||
| #endif | |||||
| int index = 0; | |||||
| if (param->in_elements_num0_ == 1) { | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| float32x4_t vin1 = vld1q_f32(in1 + index); | |||||
| float32x4_t vout = vmaxq_f32(vsubq_f32(vin0_opt, vin1), zeros); | |||||
| vst1q_f32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = MSMAX(in0[0] - in1[index], 0); | |||||
| } | |||||
| } else { | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| float32x4_t vin0 = vld1q_f32(in0 + index); | |||||
| float32x4_t vout = vmaxq_f32(vsubq_f32(vin0, vin1_opt), zeros); | |||||
| vst1q_f32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = MSMAX(in0[index] - in1[0], 0); | |||||
| } | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementOptSubRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { | |||||
| #ifdef ENABLE_NEON | |||||
| float32x4_t vin0_opt = vdupq_n_f32(in0[0]); | |||||
| float32x4_t vin1_opt = vdupq_n_f32(in1[0]); | |||||
| float32x4_t zeros = vdupq_n_f32(0.0f); | |||||
| float32x4_t bounds = vdupq_n_f32(6.0f); | |||||
| #endif | |||||
| int index = 0; | |||||
| if (param->in_elements_num0_ == 1) { | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| float32x4_t vin1 = vld1q_f32(in1 + index); | |||||
| float32x4_t vout = vminq_f32(vmaxq_f32(vsubq_f32(vin0_opt, vin1), zeros), bounds); | |||||
| vst1q_f32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = MSMIN(MSMAX(in0[0] - in1[index], 0), 6); | |||||
| } | |||||
| } else { | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| float32x4_t vin0 = vld1q_f32(in0 + index); | |||||
| float32x4_t vout = vminq_f32(vmaxq_f32(vsubq_f32(vin0, vin1_opt), zeros), bounds); | |||||
| vst1q_f32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = MSMIN(MSMAX(in0[index] - in1[0], 0), 6); | |||||
| } | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementOptAdd(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { | |||||
| #ifdef ENABLE_NEON | |||||
| float32x4_t vin0_opt = vdupq_n_f32(in0[0]); | |||||
| float32x4_t vin1_opt = vdupq_n_f32(in1[0]); | |||||
| #endif | |||||
| int index = 0; | |||||
| if (param->in_elements_num0_ == 1) { | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| float32x4_t vin1 = vld1q_f32(in1 + index); | |||||
| float32x4_t vout = vaddq_f32(vin0_opt, vin1); | |||||
| vst1q_f32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = in0[0] + in1[index]; | |||||
| } | |||||
| } else { | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| float32x4_t vin0 = vld1q_f32(in0 + index); | |||||
| float32x4_t vout = vaddq_f32(vin0, vin1_opt); | |||||
| vst1q_f32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = in0[index] + in1[0]; | |||||
| } | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementOptAddInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) { | |||||
| #ifdef ENABLE_NEON | |||||
| int32x4_t vin0_opt = vdupq_n_s32(in0[0]); | |||||
| int32x4_t vin1_opt = vdupq_n_s32(in1[0]); | |||||
| #endif | |||||
| int index = 0; | |||||
| if (param->in_elements_num0_ == 1) { | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| int32x4_t vin1 = vld1q_s32(in1 + index); | |||||
| int32x4_t vout = vaddq_s32(vin0_opt, vin1); | |||||
| vst1q_s32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = in0[0] + in1[index]; | |||||
| } | |||||
| } else { | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| int32x4_t vin0 = vld1q_s32(in0 + index); | |||||
| int32x4_t vout = vaddq_s32(vin0, vin1_opt); | |||||
| vst1q_s32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = in0[index] + in1[0]; | |||||
| } | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementOptAddRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { | |||||
| #ifdef ENABLE_NEON | |||||
| float32x4_t vin0_opt = vdupq_n_f32(in0[0]); | |||||
| float32x4_t vin1_opt = vdupq_n_f32(in1[0]); | |||||
| float32x4_t zeros = vdupq_n_f32(0.0f); | |||||
| #endif | |||||
| int index = 0; | |||||
| if (param->in_elements_num0_ == 1) { | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| float32x4_t vin1 = vld1q_f32(in1 + index); | |||||
| float32x4_t vout = vmaxq_f32(vaddq_f32(vin0_opt, vin1), zeros); | |||||
| vst1q_f32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = MSMAX(in0[0] + in1[index], 0); | |||||
| } | |||||
| } else { | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| float32x4_t vin0 = vld1q_f32(in0 + index); | |||||
| float32x4_t vout = vmaxq_f32(vaddq_f32(vin0, vin1_opt), zeros); | |||||
| vst1q_f32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = MSMAX(in0[index] + in1[0], 0); | |||||
| } | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementOptAddRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { | |||||
| #ifdef ENABLE_NEON | |||||
| float32x4_t vin0_opt = vdupq_n_f32(in0[0]); | |||||
| float32x4_t vin1_opt = vdupq_n_f32(in1[0]); | |||||
| float32x4_t zeros = vdupq_n_f32(0.0f); | |||||
| float32x4_t bounds = vdupq_n_f32(6.0f); | |||||
| #endif | |||||
| int index = 0; | |||||
| if (param->in_elements_num0_ == 1) { | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| float32x4_t vin1 = vld1q_f32(in1 + index); | |||||
| float32x4_t vout = vminq_f32(vmaxq_f32(vaddq_f32(vin0_opt, vin1), zeros), bounds); | |||||
| vst1q_f32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = MSMIN(MSMAX(in0[0] + in1[index], 0), 6); | |||||
| } | |||||
| } else { | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| float32x4_t vin0 = vld1q_f32(in0 + index); | |||||
| float32x4_t vout = vminq_f32(vmaxq_f32(vaddq_f32(vin0, vin1_opt), zeros), bounds); | |||||
| vst1q_f32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = MSMIN(MSMAX(in0[index] + in1[0], 0), 6); | |||||
| } | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementOptDiv(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { | |||||
| if (param->in_elements_num0_ == 1) { | |||||
| for (int index = 0; index < size; index++) { | |||||
| out[index] = in0[0] / in1[index]; | |||||
| } | |||||
| } else { | |||||
| if (in1[0] == 0) { | |||||
| return NNACL_ERRCODE_DIVISOR_ZERO; | |||||
| } | |||||
| for (int index = 0; index < size; index++) { | |||||
| out[index] = in0[index] / in1[0]; | |||||
| } | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementOptDivRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { | |||||
| if (param->in_elements_num0_ == 1) { | |||||
| for (int index = 0; index < size; index++) { | |||||
| out[index] = in0[0] / in1[index]; | |||||
| out[index] = out[index] > 0 ? out[index] : 0; | |||||
| } | |||||
| } else { | |||||
| for (int index = 0; index < size; index++) { | |||||
| out[index] = in0[index] / in1[0]; | |||||
| out[index] = out[index] > 0 ? out[index] : 0; | |||||
| } | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementOptDivRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { | |||||
| if (param->in_elements_num0_ == 1) { | |||||
| for (int index = 0; index < size; index++) { | |||||
| out[index] = MSMIN(MSMAX(in0[0] / in1[index], 0), 6); | |||||
| } | |||||
| } else { | |||||
| for (int index = 0; index < size; index++) { | |||||
| out[index] = MSMIN(MSMAX(in0[index] / in1[0], 0), 6); | |||||
| } | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementOptDivInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) { | |||||
| if (param->in_elements_num0_ == 1) { | |||||
| for (int index = 0; index < size; index++) { | |||||
| out[index] = in0[0] / in1[index]; | |||||
| } | |||||
| } else { | |||||
| if (in1[0] == 0) { | |||||
| return NNACL_ERRCODE_DIVISOR_ZERO; | |||||
| } | |||||
| for (int index = 0; index < size; index++) { | |||||
| out[index] = in0[index] / in1[0]; | |||||
| } | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int BroadcastAdd(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size, | |||||
| ArithmeticParameter *param) { | |||||
| TileDimensionsFp32(in0, in1, tile_in0, tile_in1, param); | |||||
| return ElementAdd(tile_in0, tile_in1, out, size); | |||||
| } | |||||
| int BroadcastMul(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size, | |||||
| ArithmeticParameter *param) { | |||||
| TileDimensionsFp32(in0, in1, tile_in0, tile_in1, param); | |||||
| return ElementMul(tile_in0, tile_in1, out, size); | |||||
| } | |||||
| int ElementMul(const float *in0, const float *in1, float *out, int size) { | |||||
| int index = 0; | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| float32x4_t vin0 = vld1q_f32(in0 + index); | |||||
| float32x4_t vin1 = vld1q_f32(in1 + index); | |||||
| float32x4_t vout = vmulq_f32(vin0, vin1); | |||||
| vst1q_f32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = in0[index] * in1[index]; | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementMulRelu(const float *in0, const float *in1, float *out, int size) { | |||||
| int index = 0; | |||||
| #ifdef ENABLE_NEON | |||||
| float32x4_t zeros = vdupq_n_f32(0.0f); | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| float32x4_t vin0 = vld1q_f32(in0 + index); | |||||
| float32x4_t vin1 = vld1q_f32(in1 + index); | |||||
| float32x4_t vout = vmulq_f32(vin0, vin1); | |||||
| vout = vbslq_f32(vcgtq_f32(vout, zeros), vout, zeros); | |||||
| vst1q_f32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| float res = in0[index] * in1[index]; | |||||
| out[index] = res > 0 ? res : 0; | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementMulRelu6(const float *in0, const float *in1, float *out, int size) { | |||||
| int index = 0; | |||||
| #ifdef ENABLE_NEON | |||||
| float32x4_t zeros = vdupq_n_f32(0.0f); | |||||
| float32x4_t bounds = vdupq_n_f32(6.0f); | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| float32x4_t vin0 = vld1q_f32(in0 + index); | |||||
| float32x4_t vin1 = vld1q_f32(in1 + index); | |||||
| float32x4_t vout = vminq_f32(vmaxq_f32(vmulq_f32(vin0, vin1), zeros), bounds); | |||||
| vst1q_f32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = MSMIN(MSMAX(in0[index] * in1[index], 0), 6); | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementMulInt(const int *in0, const int *in1, int *out, int size) { | |||||
| int index = 0; | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| int32x4_t vin0 = vld1q_s32(in0 + index); | |||||
| int32x4_t vin1 = vld1q_s32(in1 + index); | |||||
| int32x4_t vout = vmulq_s32(vin0, vin1); | |||||
| vst1q_s32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = in0[index] * in1[index]; | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementMulReluInt(const int *in0, const int *in1, int *out, int size) { | |||||
| int index = 0; | |||||
| #ifdef ENABLE_NEON | |||||
| int32x4_t zeros = vdupq_n_s32(0); | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| int32x4_t vin0 = vld1q_s32(in0 + index); | |||||
| int32x4_t vin1 = vld1q_s32(in1 + index); | |||||
| int32x4_t vout = vmulq_s32(vin0, vin1); | |||||
| vout = vbslq_s32(vcgtq_s32(vout, zeros), vout, zeros); | |||||
| vst1q_s32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| float res = in0[index] * in1[index]; | |||||
| out[index] = res > 0 ? res : 0; | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementMulRelu6Int(const int *in0, const int *in1, int *out, int size) { | |||||
| int index = 0; | |||||
| #ifdef ENABLE_NEON | |||||
| int32x4_t zeros = vdupq_n_s32(0); | |||||
| int32x4_t bounds = vdupq_n_s32(6); | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| int32x4_t vin0 = vld1q_s32(in0 + index); | |||||
| int32x4_t vin1 = vld1q_s32(in1 + index); | |||||
| int32x4_t vout = vminq_s32(vmaxq_s32(vmulq_s32(vin0, vin1), zeros), bounds); | |||||
| vst1q_s32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = MSMIN(MSMAX(in0[index] * in1[index], 0), 6); | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementAdd(const float *in0, const float *in1, float *out, int size) { | |||||
| int index = 0; | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| float32x4_t vin0 = vld1q_f32(in0 + index); | |||||
| float32x4_t vin1 = vld1q_f32(in1 + index); | |||||
| float32x4_t vout = vaddq_f32(vin0, vin1); | |||||
| vst1q_f32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = in0[index] + in1[index]; | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementAddRelu(const float *in0, const float *in1, float *out, int size) { | |||||
| int index = 0; | |||||
| #ifdef ENABLE_NEON | |||||
| float32x4_t zeros = vdupq_n_f32(0.0f); | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| float32x4_t vin0 = vld1q_f32(in0 + index); | |||||
| float32x4_t vin1 = vld1q_f32(in1 + index); | |||||
| float32x4_t vout = vaddq_f32(vin0, vin1); | |||||
| vout = vbslq_f32(vcgtq_f32(vout, zeros), vout, zeros); | |||||
| vst1q_f32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| float res = in0[index] + in1[index]; | |||||
| out[index] = res > 0 ? res : 0; | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementAddRelu6(const float *in0, const float *in1, float *out, int size) { | |||||
| int index = 0; | |||||
| #ifdef ENABLE_NEON | |||||
| float32x4_t zeros = vdupq_n_f32(0.0f); | |||||
| float32x4_t bounds = vdupq_n_f32(6.0f); | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| float32x4_t vin0 = vld1q_f32(in0 + index); | |||||
| float32x4_t vin1 = vld1q_f32(in1 + index); | |||||
| float32x4_t vout = vminq_f32(vmaxq_f32(vaddq_f32(vin0, vin1), zeros), bounds); | |||||
| vst1q_f32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = MSMIN(MSMAX(in0[index] + in1[index], 0), 6); | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementAddInt(const int *in0, const int *in1, int *out, int size) { | |||||
| int index = 0; | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| int32x4_t vin0 = vld1q_s32(in0 + index); | |||||
| int32x4_t vin1 = vld1q_s32(in1 + index); | |||||
| int32x4_t vout = vaddq_s32(vin0, vin1); | |||||
| vst1q_s32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = in0[index] + in1[index]; | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementSub(const float *in0, const float *in1, float *out, int size) { | |||||
| int index = 0; | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| float32x4_t vin0 = vld1q_f32(in0 + index); | |||||
| float32x4_t vin1 = vld1q_f32(in1 + index); | |||||
| float32x4_t vout = vsubq_f32(vin0, vin1); | |||||
| vst1q_f32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = in0[index] - in1[index]; | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementSubInt(const int *in0, const int *in1, int *out, int size) { | |||||
| int index = 0; | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| int32x4_t vin0 = vld1q_s32(in0 + index); | |||||
| int32x4_t vin1 = vld1q_s32(in1 + index); | |||||
| int32x4_t vout = vsubq_s32(vin0, vin1); | |||||
| vst1q_s32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = in0[index] - in1[index]; | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementSubRelu(const float *in0, const float *in1, float *out, int size) { | |||||
| int index = 0; | |||||
| #ifdef ENABLE_NEON | |||||
| float32x4_t zeros = vdupq_n_f32(0.0f); | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| float32x4_t vin0 = vld1q_f32(in0 + index); | |||||
| float32x4_t vin1 = vld1q_f32(in1 + index); | |||||
| float32x4_t vout = vsubq_f32(vin0, vin1); | |||||
| vout = vbslq_f32(vcgtq_f32(vout, zeros), vout, zeros); | |||||
| vst1q_f32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| float res = in0[index] - in1[index]; | |||||
| out[index] = res > 0 ? res : 0; | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementSubRelu6(const float *in0, const float *in1, float *out, int size) { | |||||
| int index = 0; | |||||
| #ifdef ENABLE_NEON | |||||
| float32x4_t zeros = vdupq_n_f32(0.0f); | |||||
| float32x4_t bounds = vdupq_n_f32(6.0f); | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| float32x4_t vin0 = vld1q_f32(in0 + index); | |||||
| float32x4_t vin1 = vld1q_f32(in1 + index); | |||||
| float32x4_t vout = vminq_f32(vmaxq_f32(vsubq_f32(vin0, vin1), zeros), bounds); | |||||
| vst1q_f32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = MSMIN(MSMAX(in0[index] - in1[index], 0), 6); | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int BroadcastDiv(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size, | |||||
| ArithmeticParameter *param) { | |||||
| TileDimensionsFp32(in0, in1, tile_in0, tile_in1, param); | |||||
| return ElementDiv(tile_in0, tile_in1, out, size); | |||||
| } | |||||
| int ElementDiv(const float *in0, const float *in1, float *out, int size) { | |||||
| for (int i = 0; i < size; i++) { | |||||
| out[i] = in0[i] / in1[i]; | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementDivRelu(const float *in0, const float *in1, float *out, int size) { | |||||
| for (int i = 0; i < size; i++) { | |||||
| float res = in0[i] / in1[i]; | |||||
| out[i] = res > 0 ? res : 0; | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementDivRelu6(const float *in0, const float *in1, float *out, int size) { | |||||
| for (int i = 0; i < size; i++) { | |||||
| out[i] = MSMIN(MSMAX(in0[i] / in1[i], 0), 6); | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementFloorMod(const float *in0, const float *in1, float *out, int size) { | int ElementFloorMod(const float *in0, const float *in1, float *out, int size) { | ||||
| for (int i = 0; i < size; i++) { | for (int i = 0; i < size; i++) { | ||||
| out[i] = in0[i] - floorf(in0[i] / in1[i]) * in1[i]; | out[i] = in0[i] - floorf(in0[i] / in1[i]) * in1[i]; | ||||
| @@ -929,11 +123,6 @@ int ElementLogicalAndBool(const bool *in0, const bool *in1, bool *out, int size) | |||||
| return NNACL_OK; | return NNACL_OK; | ||||
| } | } | ||||
| int ElementSquaredDifference(const float *in0, const float *in1, float *out, int size) { | |||||
| ElementSub(in0, in1, out, size); | |||||
| return ElementMul(out, out, out, size); | |||||
| } | |||||
| int ElementLogicalOr(const float *in0, const float *in1, float *out, int size) { | int ElementLogicalOr(const float *in0, const float *in1, float *out, int size) { | ||||
| int index = 0; | int index = 0; | ||||
| #ifdef ENABLE_NEON | #ifdef ENABLE_NEON | ||||
| @@ -22,6 +22,11 @@ | |||||
| #include "nnacl/op_base.h" | #include "nnacl/op_base.h" | ||||
| #include "nnacl/base/arithmetic_base.h" | #include "nnacl/base/arithmetic_base.h" | ||||
| #include "nnacl/errorcode.h" | #include "nnacl/errorcode.h" | ||||
| #include "nnacl/fp32/add_fp32.h" | |||||
| #include "nnacl/fp32/mul_fp32.h" | |||||
| #include "nnacl/fp32/div_fp32.h" | |||||
| #include "nnacl/fp32/sub_fp32.h" | |||||
| #include "nnacl/fp32/squared_difference.h" | |||||
| #ifdef __cplusplus | #ifdef __cplusplus | ||||
| extern "C" { | extern "C" { | ||||
| @@ -30,56 +35,6 @@ void TileOneDimensionFp32(const float *inData, float *outData, int dim, size_t n | |||||
| const int *inStrides, const int *outStrides, const int *multiple); | const int *inStrides, const int *outStrides, const int *multiple); | ||||
| void TileDimensionsFp32(const float *data0, const float *data1, float *tile_data0, float *tile_data1, | void TileDimensionsFp32(const float *data0, const float *data1, float *tile_data0, float *tile_data1, | ||||
| ArithmeticParameter *param); | ArithmeticParameter *param); | ||||
| /* Mul */ | |||||
| int ElementMul(const float *in0, const float *in1, float *out, int size); | |||||
| int ElementMulRelu(const float *in0, const float *in1, float *out, int size); | |||||
| int ElementMulRelu6(const float *in0, const float *in1, float *out, int size); | |||||
| int ElementMulInt(const int *in0, const int *in1, int *out, int size); | |||||
| int ElementMulReluInt(const int *in0, const int *in1, int *out, int size); | |||||
| int ElementMulRelu6Int(const int *in0, const int *in1, int *out, int size); | |||||
| int ElementOptMul(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); | |||||
| int ElementOptMulRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); | |||||
| int ElementOptMulRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); | |||||
| int ElementOptMulInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param); | |||||
| int ElementOptMulReluInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param); | |||||
| int ElementOptMulRelu6Int(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param); | |||||
| int BroadcastMul(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size, | |||||
| ArithmeticParameter *param); | |||||
| /* Add */ | |||||
| int ElementAdd(const float *in0, const float *in1, float *out, int size); | |||||
| int ElementAddRelu(const float *in0, const float *in1, float *out, int size); | |||||
| int ElementAddRelu6(const float *in0, const float *in1, float *out, int size); | |||||
| int ElementAddInt(const int *in0, const int *in1, int *out, int size); | |||||
| int ElementOptAdd(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); | |||||
| int ElementOptAddInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param); | |||||
| int ElementOptAddRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); | |||||
| int ElementOptAddRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); | |||||
| int BroadcastAdd(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size, | |||||
| ArithmeticParameter *param); | |||||
| /* Sub */ | |||||
| int ElementSub(const float *in0, const float *in1, float *out, int size); | |||||
| int ElementSubInt(const int *in0, const int *in1, int *out, int size); | |||||
| int ElementSubRelu(const float *in0, const float *in1, float *out, int size); | |||||
| int ElementSubRelu6(const float *in0, const float *in1, float *out, int size); | |||||
| int ElementOptSub(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); | |||||
| int ElementOptSubRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); | |||||
| int ElementOptSubRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); | |||||
| int ElementOptSubInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param); | |||||
| /* Div */ | |||||
| int ElementDiv(const float *in0, const float *in1, float *out, int size); | |||||
| int ElementDivRelu(const float *in0, const float *in1, float *out, int size); | |||||
| int ElementDivRelu6(const float *in0, const float *in1, float *out, int size); | |||||
| int ElementOptDiv(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); | |||||
| int ElementOptDivRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); | |||||
| int ElementOptDivRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); | |||||
| int ElementOptDivInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param); | |||||
| int BroadcastDiv(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size, | |||||
| ArithmeticParameter *param); | |||||
| /* logical and */ | /* logical and */ | ||||
| int ElementLogicalAnd(const float *in0, const float *in1, float *out, int size); | int ElementLogicalAnd(const float *in0, const float *in1, float *out, int size); | ||||
| int ElementLogicalAndInt(const int *in0, const int *in1, int *out, int size); | int ElementLogicalAndInt(const int *in0, const int *in1, int *out, int size); | ||||
| @@ -88,9 +43,6 @@ int ElementLogicalAndBool(const bool *in0, const bool *in1, bool *out, int size) | |||||
| /* logical or */ | /* logical or */ | ||||
| int ElementLogicalOr(const float *in0, const float *in1, float *out, int size); | int ElementLogicalOr(const float *in0, const float *in1, float *out, int size); | ||||
| /* Element Squared Difference */ | |||||
| int ElementSquaredDifference(const float *in0, const float *in1, float *out, int size); | |||||
| /* max min */ | /* max min */ | ||||
| int ElementMaximum(const float *in0, const float *in1, float *out, int size); | int ElementMaximum(const float *in0, const float *in1, float *out, int size); | ||||
| int ElementMinimum(const float *in0, const float *in1, float *out, int size); | int ElementMinimum(const float *in0, const float *in1, float *out, int size); | ||||
| @@ -0,0 +1,107 @@ | |||||
| /** | |||||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #include "nnacl/fp32/div_fp32.h" | |||||
| #include <math.h> | |||||
| #include "nnacl/fp32/arithmetic_fp32.h" | |||||
| int ElementOptDiv(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { | |||||
| if (param->in_elements_num0_ == 1) { | |||||
| for (int index = 0; index < size; index++) { | |||||
| out[index] = in0[0] / in1[index]; | |||||
| } | |||||
| } else { | |||||
| if (in1[0] == 0) { | |||||
| return NNACL_ERRCODE_DIVISOR_ZERO; | |||||
| } | |||||
| for (int index = 0; index < size; index++) { | |||||
| out[index] = in0[index] / in1[0]; | |||||
| } | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementOptDivRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { | |||||
| if (param->in_elements_num0_ == 1) { | |||||
| for (int index = 0; index < size; index++) { | |||||
| out[index] = in0[0] / in1[index]; | |||||
| out[index] = out[index] > 0 ? out[index] : 0; | |||||
| } | |||||
| } else { | |||||
| for (int index = 0; index < size; index++) { | |||||
| out[index] = in0[index] / in1[0]; | |||||
| out[index] = out[index] > 0 ? out[index] : 0; | |||||
| } | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementOptDivRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { | |||||
| if (param->in_elements_num0_ == 1) { | |||||
| for (int index = 0; index < size; index++) { | |||||
| out[index] = MSMIN(MSMAX(in0[0] / in1[index], 0), 6); | |||||
| } | |||||
| } else { | |||||
| for (int index = 0; index < size; index++) { | |||||
| out[index] = MSMIN(MSMAX(in0[index] / in1[0], 0), 6); | |||||
| } | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementOptDivInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) { | |||||
| if (param->in_elements_num0_ == 1) { | |||||
| for (int index = 0; index < size; index++) { | |||||
| out[index] = in0[0] / in1[index]; | |||||
| } | |||||
| } else { | |||||
| if (in1[0] == 0) { | |||||
| return NNACL_ERRCODE_DIVISOR_ZERO; | |||||
| } | |||||
| for (int index = 0; index < size; index++) { | |||||
| out[index] = in0[index] / in1[0]; | |||||
| } | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int BroadcastDiv(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size, | |||||
| ArithmeticParameter *param) { | |||||
| TileDimensionsFp32(in0, in1, tile_in0, tile_in1, param); | |||||
| return ElementDiv(tile_in0, tile_in1, out, size); | |||||
| } | |||||
| int ElementDiv(const float *in0, const float *in1, float *out, int size) { | |||||
| for (int i = 0; i < size; i++) { | |||||
| out[i] = in0[i] / in1[i]; | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementDivRelu(const float *in0, const float *in1, float *out, int size) { | |||||
| for (int i = 0; i < size; i++) { | |||||
| float res = in0[i] / in1[i]; | |||||
| out[i] = res > 0 ? res : 0; | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementDivRelu6(const float *in0, const float *in1, float *out, int size) { | |||||
| for (int i = 0; i < size; i++) { | |||||
| out[i] = MSMIN(MSMAX(in0[i] / in1[i], 0), 6); | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| @@ -0,0 +1,43 @@ | |||||
| /** | |||||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #ifndef MINDSPORE_LITE_NNACL_FP32_DIV_H_ | |||||
| #define MINDSPORE_LITE_NNACL_FP32_DIV_H_ | |||||
| #ifdef ENABLE_NEON | |||||
| #include <arm_neon.h> | |||||
| #endif | |||||
| #include "nnacl/op_base.h" | |||||
| #include "nnacl/base/arithmetic_base.h" | |||||
| #include "nnacl/errorcode.h" | |||||
| #ifdef __cplusplus | |||||
| extern "C" { | |||||
| #endif | |||||
| int ElementDiv(const float *in0, const float *in1, float *out, int size); | |||||
| int ElementDivRelu(const float *in0, const float *in1, float *out, int size); | |||||
| int ElementDivRelu6(const float *in0, const float *in1, float *out, int size); | |||||
| int ElementOptDiv(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); | |||||
| int ElementOptDivRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); | |||||
| int ElementOptDivRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); | |||||
| int ElementOptDivInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param); | |||||
| int BroadcastDiv(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size, | |||||
| ArithmeticParameter *param); | |||||
| #ifdef __cplusplus | |||||
| } | |||||
| #endif | |||||
| #endif // MINDSPORE_LITE_NNACL_FP32_DIV_H_ | |||||
| @@ -19,6 +19,7 @@ | |||||
| #include <float.h> | #include <float.h> | ||||
| #include "nnacl/fp32/activation_fp32.h" | #include "nnacl/fp32/activation_fp32.h" | ||||
| #include "nnacl/fp32/arithmetic_fp32.h" | #include "nnacl/fp32/arithmetic_fp32.h" | ||||
| #include "nnacl/fp32/mul_fp32.h" | |||||
| void InitGate(float *gate_buffer, const float *bias, const LstmParameter *lstm_parm) { | void InitGate(float *gate_buffer, const float *bias, const LstmParameter *lstm_parm) { | ||||
| int gate_offest = 0; | int gate_offest = 0; | ||||
| @@ -0,0 +1,327 @@ | |||||
| /** | |||||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #include "nnacl/fp32/mul_fp32.h" | |||||
| #include "nnacl/fp32/arithmetic_fp32.h" | |||||
| int BroadcastMul(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size, | |||||
| ArithmeticParameter *param) { | |||||
| TileDimensionsFp32(in0, in1, tile_in0, tile_in1, param); | |||||
| return ElementMul(tile_in0, tile_in1, out, size); | |||||
| } | |||||
| int ElementMul(const float *in0, const float *in1, float *out, int size) { | |||||
| int index = 0; | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| float32x4_t vin0 = vld1q_f32(in0 + index); | |||||
| float32x4_t vin1 = vld1q_f32(in1 + index); | |||||
| float32x4_t vout = vmulq_f32(vin0, vin1); | |||||
| vst1q_f32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = in0[index] * in1[index]; | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementMulRelu(const float *in0, const float *in1, float *out, int size) { | |||||
| int index = 0; | |||||
| #ifdef ENABLE_NEON | |||||
| float32x4_t zeros = vdupq_n_f32(0.0f); | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| float32x4_t vin0 = vld1q_f32(in0 + index); | |||||
| float32x4_t vin1 = vld1q_f32(in1 + index); | |||||
| float32x4_t vout = vmulq_f32(vin0, vin1); | |||||
| vout = vbslq_f32(vcgtq_f32(vout, zeros), vout, zeros); | |||||
| vst1q_f32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| float res = in0[index] * in1[index]; | |||||
| out[index] = res > 0 ? res : 0; | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementMulRelu6(const float *in0, const float *in1, float *out, int size) { | |||||
| int index = 0; | |||||
| #ifdef ENABLE_NEON | |||||
| float32x4_t zeros = vdupq_n_f32(0.0f); | |||||
| float32x4_t bounds = vdupq_n_f32(6.0f); | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| float32x4_t vin0 = vld1q_f32(in0 + index); | |||||
| float32x4_t vin1 = vld1q_f32(in1 + index); | |||||
| float32x4_t vout = vminq_f32(vmaxq_f32(vmulq_f32(vin0, vin1), zeros), bounds); | |||||
| vst1q_f32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = MSMIN(MSMAX(in0[index] * in1[index], 0), 6); | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementMulInt(const int *in0, const int *in1, int *out, int size) { | |||||
| int index = 0; | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| int32x4_t vin0 = vld1q_s32(in0 + index); | |||||
| int32x4_t vin1 = vld1q_s32(in1 + index); | |||||
| int32x4_t vout = vmulq_s32(vin0, vin1); | |||||
| vst1q_s32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = in0[index] * in1[index]; | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementMulReluInt(const int *in0, const int *in1, int *out, int size) { | |||||
| int index = 0; | |||||
| #ifdef ENABLE_NEON | |||||
| int32x4_t zeros = vdupq_n_s32(0); | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| int32x4_t vin0 = vld1q_s32(in0 + index); | |||||
| int32x4_t vin1 = vld1q_s32(in1 + index); | |||||
| int32x4_t vout = vmulq_s32(vin0, vin1); | |||||
| vout = vbslq_s32(vcgtq_s32(vout, zeros), vout, zeros); | |||||
| vst1q_s32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| float res = in0[index] * in1[index]; | |||||
| out[index] = res > 0 ? res : 0; | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementMulRelu6Int(const int *in0, const int *in1, int *out, int size) { | |||||
| int index = 0; | |||||
| #ifdef ENABLE_NEON | |||||
| int32x4_t zeros = vdupq_n_s32(0); | |||||
| int32x4_t bounds = vdupq_n_s32(6); | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| int32x4_t vin0 = vld1q_s32(in0 + index); | |||||
| int32x4_t vin1 = vld1q_s32(in1 + index); | |||||
| int32x4_t vout = vminq_s32(vmaxq_s32(vmulq_s32(vin0, vin1), zeros), bounds); | |||||
| vst1q_s32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = MSMIN(MSMAX(in0[index] * in1[index], 0), 6); | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementOptMul(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { | |||||
| #ifdef ENABLE_NEON | |||||
| float32x4_t vin0_opt = vdupq_n_f32(in0[0]); | |||||
| float32x4_t vin1_opt = vdupq_n_f32(in1[0]); | |||||
| #endif | |||||
| int index = 0; | |||||
| if (param->in_elements_num0_ == 1) { | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| float32x4_t vin1 = vld1q_f32(in1 + index); | |||||
| float32x4_t vout = vmulq_f32(vin0_opt, vin1); | |||||
| vst1q_f32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = in0[0] * in1[index]; | |||||
| } | |||||
| } else { | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| float32x4_t vin0 = vld1q_f32(in0 + index); | |||||
| float32x4_t vout = vmulq_f32(vin0, vin1_opt); | |||||
| vst1q_f32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = in0[index] * in1[0]; | |||||
| } | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementOptMulRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { | |||||
| #ifdef ENABLE_NEON | |||||
| float32x4_t vin0_opt = vdupq_n_f32(in0[0]); | |||||
| float32x4_t vin1_opt = vdupq_n_f32(in1[0]); | |||||
| float32x4_t zeros = vdupq_n_f32(0.0f); | |||||
| #endif | |||||
| int index = 0; | |||||
| if (param->in_elements_num0_ == 1) { | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| float32x4_t vin1 = vld1q_f32(in1 + index); | |||||
| float32x4_t vout = vmaxq_f32(vmulq_f32(vin0_opt, vin1), zeros); | |||||
| vst1q_f32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = MSMAX(in0[0] * in1[index], 0); | |||||
| } | |||||
| } else { | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| float32x4_t vin0 = vld1q_f32(in0 + index); | |||||
| float32x4_t vout = vmaxq_f32(vmulq_f32(vin0, vin1_opt), zeros); | |||||
| vst1q_f32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = MSMAX(in0[index] * in1[0], 0); | |||||
| } | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementOptMulRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { | |||||
| #ifdef ENABLE_NEON | |||||
| float32x4_t vin0_opt = vdupq_n_f32(in0[0]); | |||||
| float32x4_t vin1_opt = vdupq_n_f32(in1[0]); | |||||
| float32x4_t zeros = vdupq_n_f32(0.0f); | |||||
| float32x4_t bounds = vdupq_n_f32(6.0f); | |||||
| #endif | |||||
| int index = 0; | |||||
| if (param->in_elements_num0_ == 1) { | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| float32x4_t vin1 = vld1q_f32(in1 + index); | |||||
| float32x4_t vout = vminq_f32(vmaxq_f32(vmulq_f32(vin0_opt, vin1), zeros), bounds); | |||||
| vst1q_f32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = MSMIN(MSMAX(in0[0] * in1[index], 0), 6); | |||||
| } | |||||
| } else { | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| float32x4_t vin0 = vld1q_f32(in0 + index); | |||||
| float32x4_t vout = vminq_f32(vmaxq_f32(vmulq_f32(vin0, vin1_opt), zeros), bounds); | |||||
| vst1q_f32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = MSMIN(MSMAX(in0[index] * in1[0], 0), 6); | |||||
| } | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementOptMulInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) { | |||||
| #ifdef ENABLE_NEON | |||||
| int32x4_t vin0_opt = vdupq_n_s32(in0[0]); | |||||
| int32x4_t vin1_opt = vdupq_n_s32(in1[0]); | |||||
| #endif | |||||
| int index = 0; | |||||
| if (param->in_elements_num0_ == 1) { | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| int32x4_t vin1 = vld1q_s32(in1 + index); | |||||
| int32x4_t vout = vmulq_s32(vin0_opt, vin1); | |||||
| vst1q_s32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = in0[0] * in1[index]; | |||||
| } | |||||
| } else { | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| int32x4_t vin0 = vld1q_s32(in0 + index); | |||||
| int32x4_t vout = vmulq_s32(vin0, vin1_opt); | |||||
| vst1q_s32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = in0[index] * in1[0]; | |||||
| } | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementOptMulReluInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) { | |||||
| #ifdef ENABLE_NEON | |||||
| int32x4_t vin0_opt = vdupq_n_s32(in0[0]); | |||||
| int32x4_t vin1_opt = vdupq_n_s32(in1[0]); | |||||
| int32x4_t zeros = vdupq_n_s32(0); | |||||
| #endif | |||||
| int index = 0; | |||||
| if (param->in_elements_num0_ == 1) { | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| int32x4_t vin1 = vld1q_s32(in1 + index); | |||||
| int32x4_t vout = vmaxq_s32(vmulq_s32(vin0_opt, vin1), zeros); | |||||
| vst1q_s32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = MSMAX(in0[0] * in1[index], 0); | |||||
| } | |||||
| } else { | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| int32x4_t vin0 = vld1q_s32(in0 + index); | |||||
| int32x4_t vout = vmaxq_s32(vmulq_s32(vin0, vin1_opt), zeros); | |||||
| vst1q_s32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = MSMAX(in0[index] * in1[0], 0); | |||||
| } | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementOptMulRelu6Int(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) { | |||||
| #ifdef ENABLE_NEON | |||||
| int32x4_t vin0_opt = vdupq_n_s32(in0[0]); | |||||
| int32x4_t vin1_opt = vdupq_n_s32(in1[0]); | |||||
| int32x4_t zeros = vdupq_n_s32(0); | |||||
| int32x4_t bounds = vdupq_n_s32(6); | |||||
| #endif | |||||
| int index = 0; | |||||
| if (param->in_elements_num0_ == 1) { | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| int32x4_t vin1 = vld1q_s32(in1 + index); | |||||
| int32x4_t vout = vminq_s32(vmaxq_s32(vmulq_s32(vin0_opt, vin1), zeros), bounds); | |||||
| vst1q_s32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = MSMIN(MSMAX(in0[0] * in1[index], 0), 6); | |||||
| } | |||||
| } else { | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| int32x4_t vin0 = vld1q_s32(in0 + index); | |||||
| int32x4_t vout = vminq_s32(vmaxq_s32(vmulq_s32(vin0, vin1_opt), zeros), bounds); | |||||
| vst1q_s32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = MSMIN(MSMAX(in0[index] * in1[0], 0), 6); | |||||
| } | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| @@ -0,0 +1,49 @@ | |||||
| /** | |||||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #ifndef MINDSPORE_LITE_NNACL_FP32_MUL_H_ | |||||
| #define MINDSPORE_LITE_NNACL_FP32_MUL_H_ | |||||
| #ifdef ENABLE_NEON | |||||
| #include <arm_neon.h> | |||||
| #endif | |||||
| #include "nnacl/op_base.h" | |||||
| #include "nnacl/base/arithmetic_base.h" | |||||
| #include "nnacl/errorcode.h" | |||||
| #ifdef __cplusplus | |||||
| extern "C" { | |||||
| #endif | |||||
| int ElementMul(const float *in0, const float *in1, float *out, int size); | |||||
| int ElementMulRelu(const float *in0, const float *in1, float *out, int size); | |||||
| int ElementMulRelu6(const float *in0, const float *in1, float *out, int size); | |||||
| int ElementMulInt(const int *in0, const int *in1, int *out, int size); | |||||
| int ElementMulReluInt(const int *in0, const int *in1, int *out, int size); | |||||
| int ElementMulRelu6Int(const int *in0, const int *in1, int *out, int size); | |||||
| int ElementOptMul(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); | |||||
| int ElementOptMulRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); | |||||
| int ElementOptMulRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); | |||||
| int ElementOptMulInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param); | |||||
| int ElementOptMulReluInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param); | |||||
| int ElementOptMulRelu6Int(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param); | |||||
| int BroadcastMul(const float *in0, const float *in1, float *tile_in0, float *tile_in1, float *out, int size, | |||||
| ArithmeticParameter *param); | |||||
| #ifdef __cplusplus | |||||
| } | |||||
| #endif | |||||
| #endif // MINDSPORE_LITE_NNACL_FP32_MUL_H_ | |||||
| @@ -0,0 +1,28 @@ | |||||
| /** | |||||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #ifndef MINDSPORE_LITE_NNACL_SQUARED_DIFFERENCE_H_ | |||||
| #define MINDSPORE_LITE_NNACL_SQUARED_DIFFERENCE_H_ | |||||
| #include "nnacl/fp32/squared_difference.h" | |||||
| #include "nnacl/fp32/sub_fp32.h" | |||||
| #include "nnacl/fp32/mul_fp32.h" | |||||
| int ElementSquaredDifference(const float *in0, const float *in1, float *out, int size) { | |||||
| ElementSub(in0, in1, out, size); | |||||
| return ElementMul(out, out, out, size); | |||||
| } | |||||
| #endif // MINDSPORE_LITE_NNACL_SQUARED_DIFFERENCE_H_ | |||||
| @@ -0,0 +1,37 @@ | |||||
| /** | |||||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #ifndef MINDSPORE_LITE_NNACL_SQUARED_DIFFERENCE_H_ | |||||
| #define MINDSPORE_LITE_NNACL_SQUARED_DIFFERENCE_H_ | |||||
| #ifdef ENABLE_NEON | |||||
| #include <arm_neon.h> | |||||
| #endif | |||||
| #include "nnacl/op_base.h" | |||||
| #include "nnacl/base/arithmetic_base.h" | |||||
| #include "nnacl/errorcode.h" | |||||
| #ifdef __cplusplus | |||||
| extern "C" { | |||||
| #endif | |||||
| /* Element Squared Difference */ | |||||
| int ElementSquaredDifference(const float *in0, const float *in1, float *out, int size); | |||||
| #ifdef __cplusplus | |||||
| } | |||||
| #endif | |||||
| #endif // MINDSPORE_LITE_NNACL_SQUARED_DIFFERENCE_H_ | |||||
| @@ -0,0 +1,217 @@ | |||||
| /** | |||||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #include "nnacl/fp32/sub_fp32.h" | |||||
| int ElementOptSub(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { | |||||
| #ifdef ENABLE_NEON | |||||
| float32x4_t vin0_opt = vdupq_n_f32(in0[0]); | |||||
| float32x4_t vin1_opt = vdupq_n_f32(in1[0]); | |||||
| #endif | |||||
| int index = 0; | |||||
| if (param->in_elements_num0_ == 1) { | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| float32x4_t vin1 = vld1q_f32(in1 + index); | |||||
| float32x4_t vout = vsubq_f32(vin0_opt, vin1); | |||||
| vst1q_f32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = in0[0] - in1[index]; | |||||
| } | |||||
| } else { | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| float32x4_t vin0 = vld1q_f32(in0 + index); | |||||
| float32x4_t vout = vsubq_f32(vin0, vin1_opt); | |||||
| vst1q_f32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = in0[index] - in1[0]; | |||||
| } | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementOptSubInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param) { | |||||
| #ifdef ENABLE_NEON | |||||
| int32x4_t vin0_opt = vdupq_n_s32(in0[0]); | |||||
| int32x4_t vin1_opt = vdupq_n_s32(in1[0]); | |||||
| #endif | |||||
| int index = 0; | |||||
| if (param->in_elements_num0_ == 1) { | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| int32x4_t vin1 = vld1q_s32(in1 + index); | |||||
| int32x4_t vout = vsubq_s32(vin0_opt, vin1); | |||||
| vst1q_s32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = in0[0] - in1[index]; | |||||
| } | |||||
| } else { | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| int32x4_t vin0 = vld1q_s32(in0 + index); | |||||
| int32x4_t vout = vsubq_s32(vin0, vin1_opt); | |||||
| vst1q_s32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = in0[index] - in1[0]; | |||||
| } | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementOptSubRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { | |||||
| #ifdef ENABLE_NEON | |||||
| float32x4_t vin0_opt = vdupq_n_f32(in0[0]); | |||||
| float32x4_t vin1_opt = vdupq_n_f32(in1[0]); | |||||
| float32x4_t zeros = vdupq_n_f32(0.0f); | |||||
| #endif | |||||
| int index = 0; | |||||
| if (param->in_elements_num0_ == 1) { | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| float32x4_t vin1 = vld1q_f32(in1 + index); | |||||
| float32x4_t vout = vmaxq_f32(vsubq_f32(vin0_opt, vin1), zeros); | |||||
| vst1q_f32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = MSMAX(in0[0] - in1[index], 0); | |||||
| } | |||||
| } else { | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| float32x4_t vin0 = vld1q_f32(in0 + index); | |||||
| float32x4_t vout = vmaxq_f32(vsubq_f32(vin0, vin1_opt), zeros); | |||||
| vst1q_f32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = MSMAX(in0[index] - in1[0], 0); | |||||
| } | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementOptSubRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param) { | |||||
| #ifdef ENABLE_NEON | |||||
| float32x4_t vin0_opt = vdupq_n_f32(in0[0]); | |||||
| float32x4_t vin1_opt = vdupq_n_f32(in1[0]); | |||||
| float32x4_t zeros = vdupq_n_f32(0.0f); | |||||
| float32x4_t bounds = vdupq_n_f32(6.0f); | |||||
| #endif | |||||
| int index = 0; | |||||
| if (param->in_elements_num0_ == 1) { | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| float32x4_t vin1 = vld1q_f32(in1 + index); | |||||
| float32x4_t vout = vminq_f32(vmaxq_f32(vsubq_f32(vin0_opt, vin1), zeros), bounds); | |||||
| vst1q_f32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = MSMIN(MSMAX(in0[0] - in1[index], 0), 6); | |||||
| } | |||||
| } else { | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| float32x4_t vin0 = vld1q_f32(in0 + index); | |||||
| float32x4_t vout = vminq_f32(vmaxq_f32(vsubq_f32(vin0, vin1_opt), zeros), bounds); | |||||
| vst1q_f32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = MSMIN(MSMAX(in0[index] - in1[0], 0), 6); | |||||
| } | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementSub(const float *in0, const float *in1, float *out, int size) { | |||||
| int index = 0; | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| float32x4_t vin0 = vld1q_f32(in0 + index); | |||||
| float32x4_t vin1 = vld1q_f32(in1 + index); | |||||
| float32x4_t vout = vsubq_f32(vin0, vin1); | |||||
| vst1q_f32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = in0[index] - in1[index]; | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementSubInt(const int *in0, const int *in1, int *out, int size) { | |||||
| int index = 0; | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| int32x4_t vin0 = vld1q_s32(in0 + index); | |||||
| int32x4_t vin1 = vld1q_s32(in1 + index); | |||||
| int32x4_t vout = vsubq_s32(vin0, vin1); | |||||
| vst1q_s32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = in0[index] - in1[index]; | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementSubRelu(const float *in0, const float *in1, float *out, int size) { | |||||
| int index = 0; | |||||
| #ifdef ENABLE_NEON | |||||
| float32x4_t zeros = vdupq_n_f32(0.0f); | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| float32x4_t vin0 = vld1q_f32(in0 + index); | |||||
| float32x4_t vin1 = vld1q_f32(in1 + index); | |||||
| float32x4_t vout = vsubq_f32(vin0, vin1); | |||||
| vout = vbslq_f32(vcgtq_f32(vout, zeros), vout, zeros); | |||||
| vst1q_f32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| float res = in0[index] - in1[index]; | |||||
| out[index] = res > 0 ? res : 0; | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementSubRelu6(const float *in0, const float *in1, float *out, int size) { | |||||
| int index = 0; | |||||
| #ifdef ENABLE_NEON | |||||
| float32x4_t zeros = vdupq_n_f32(0.0f); | |||||
| float32x4_t bounds = vdupq_n_f32(6.0f); | |||||
| for (; index <= size - 4; index += C4NUM) { | |||||
| float32x4_t vin0 = vld1q_f32(in0 + index); | |||||
| float32x4_t vin1 = vld1q_f32(in1 + index); | |||||
| float32x4_t vout = vminq_f32(vmaxq_f32(vsubq_f32(vin0, vin1), zeros), bounds); | |||||
| vst1q_f32(out + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < size; index++) { | |||||
| out[index] = MSMIN(MSMAX(in0[index] - in1[index], 0), 6); | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| @@ -0,0 +1,43 @@ | |||||
| /** | |||||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #ifndef MINDSPORE_LITE_NNACL_SUB_FP32_H_ | |||||
| #define MINDSPORE_LITE_NNACL_SUB_FP32_H_ | |||||
| #ifdef ENABLE_NEON | |||||
| #include <arm_neon.h> | |||||
| #endif | |||||
| #include "nnacl/op_base.h" | |||||
| #include "nnacl/base/arithmetic_base.h" | |||||
| #include "nnacl/errorcode.h" | |||||
| #ifdef __cplusplus | |||||
| extern "C" { | |||||
| #endif | |||||
| int ElementSub(const float *in0, const float *in1, float *out, int size); | |||||
| int ElementSubInt(const int *in0, const int *in1, int *out, int size); | |||||
| int ElementSubRelu(const float *in0, const float *in1, float *out, int size); | |||||
| int ElementSubRelu6(const float *in0, const float *in1, float *out, int size); | |||||
| int ElementOptSub(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); | |||||
| int ElementOptSubRelu(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); | |||||
| int ElementOptSubRelu6(const float *in0, const float *in1, float *out, int size, const ArithmeticParameter *param); | |||||
| int ElementOptSubInt(const int *in0, const int *in1, int *out, int size, const ArithmeticParameter *param); | |||||
| #ifdef __cplusplus | |||||
| } | |||||
| #endif | |||||
| #endif // MINDSPORE_LITE_NNACL_SUB_FP32_H_ | |||||