| @@ -348,6 +348,38 @@ int ElementOptAdd(float *input0, float *input1, float *output, int element_size, | |||||
| return NNACL_OK; | return NNACL_OK; | ||||
| } | } | ||||
| int ElementOptAddInt(int *input0, int *input1, int *output, int element_size, ArithmeticParameter *param) { | |||||
| #ifdef ENABLE_NEON | |||||
| int32x4_t vin0_opt = vdupq_n_s32(input0[0]); | |||||
| int32x4_t vin1_opt = vdupq_n_s32(input1[0]); | |||||
| #endif | |||||
| int index = 0; | |||||
| if (param->in_elements_num0_ == 1) { | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= element_size - 4; index += C4NUM) { | |||||
| int32x4_t vin1 = vld1q_s32(input1 + index); | |||||
| int32x4_t vout = vaddq_s32(vin0_opt, vin1); | |||||
| vst1q_s32(output + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < element_size; index++) { | |||||
| output[index] = input0[0] + input1[index]; | |||||
| } | |||||
| } else { | |||||
| #ifdef ENABLE_NEON | |||||
| for (; index <= element_size - 4; index += C4NUM) { | |||||
| int32x4_t vin0 = vld1q_s32(input0 + index); | |||||
| int32x4_t vout = vaddq_s32(vin0, vin1_opt); | |||||
| vst1q_s32(output + index, vout); | |||||
| } | |||||
| #endif | |||||
| for (; index < element_size; index++) { | |||||
| output[index] = input0[index] + input1[0]; | |||||
| } | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ElementOptAddRelu(float *input0, float *input1, float *output, int element_size, ArithmeticParameter *param) { | int ElementOptAddRelu(float *input0, float *input1, float *output, int element_size, ArithmeticParameter *param) { | ||||
| #ifdef ENABLE_NEON | #ifdef ENABLE_NEON | ||||
| float32x4_t vin0_opt = vdupq_n_f32(input0[0]); | float32x4_t vin0_opt = vdupq_n_f32(input0[0]); | ||||
| @@ -739,6 +771,13 @@ int ElementFloorMod(float *input0, float *input1, float *output, int element_siz | |||||
| return NNACL_OK; | return NNACL_OK; | ||||
| } | } | ||||
| int ElementFloorModInt(int *input0, int *input1, int *output, int element_size) { | |||||
| for (int i = 0; i < element_size; i++) { | |||||
| output[i] = input0[i] - (input0[i] / input1[i]) * input1[i]; | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int BroadcastFloorMod(float *input0, float *input1, float *tile_input0, float *tile_input1, float *output, | int BroadcastFloorMod(float *input0, float *input1, float *tile_input0, float *tile_input1, float *output, | ||||
| int element_size, ArithmeticParameter *param) { | int element_size, ArithmeticParameter *param) { | ||||
| TileDimensions(input0, input1, tile_input0, tile_input1, param); | TileDimensions(input0, input1, tile_input0, tile_input1, param); | ||||
| @@ -752,6 +791,13 @@ int ElementFloorDiv(float *input0, float *input1, float *output, int element_siz | |||||
| return NNACL_OK; | return NNACL_OK; | ||||
| } | } | ||||
| int ElementFloorDivInt(int *input0, int *input1, int *output, int element_size) { | |||||
| for (int i = 0; i < element_size; i++) { | |||||
| output[i] = input0[i] / input1[i]; | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int BroadcastFloorDiv(float *input0, float *input1, float *tile_input0, float *tile_input1, float *output, | int BroadcastFloorDiv(float *input0, float *input1, float *tile_input0, float *tile_input1, float *output, | ||||
| int element_size, ArithmeticParameter *param) { | int element_size, ArithmeticParameter *param) { | ||||
| TileDimensions(input0, input1, tile_input0, tile_input1, param); | TileDimensions(input0, input1, tile_input0, tile_input1, param); | ||||
| @@ -27,6 +27,7 @@ | |||||
| extern "C" { | extern "C" { | ||||
| #endif | #endif | ||||
| int ElementOptAdd(float *input0, float *input1, float *output, int element_size, ArithmeticParameter *param); | int ElementOptAdd(float *input0, float *input1, float *output, int element_size, ArithmeticParameter *param); | ||||
| int ElementOptAddInt(int *input0, int *input1, int *output, int element_size, ArithmeticParameter *param); | |||||
| int ElementOptAddRelu(float *input0, float *input1, float *output, int element_size, ArithmeticParameter *param); | int ElementOptAddRelu(float *input0, float *input1, float *output, int element_size, ArithmeticParameter *param); | ||||
| int ElementOptAddRelu6(float *input0, float *input1, float *output, int element_size, ArithmeticParameter *param); | int ElementOptAddRelu6(float *input0, float *input1, float *output, int element_size, ArithmeticParameter *param); | ||||
| int ElementOptSub(float *input0, float *input1, float *output, int element_size, ArithmeticParameter *param); | int ElementOptSub(float *input0, float *input1, float *output, int element_size, ArithmeticParameter *param); | ||||
| @@ -87,10 +88,12 @@ int BroadcastMinimum(float *input0, float *input1, float *tile_input0, float *ti | |||||
| int element_size, ArithmeticParameter *param); | int element_size, ArithmeticParameter *param); | ||||
| int ElementFloorDiv(float *input0, float *input1, float *output, int element_size); | int ElementFloorDiv(float *input0, float *input1, float *output, int element_size); | ||||
| int ElementFloorDivInt(int *input0, int *input1, int *output, int element_size); | |||||
| int BroadcastFloorDiv(float *input0, float *input1, float *tile_input0, float *tile_input1, float *output, | int BroadcastFloorDiv(float *input0, float *input1, float *tile_input0, float *tile_input1, float *output, | ||||
| int element_size, ArithmeticParameter *param); | int element_size, ArithmeticParameter *param); | ||||
| int ElementFloorMod(float *input0, float *input1, float *output, int element_size); | int ElementFloorMod(float *input0, float *input1, float *output, int element_size); | ||||
| int ElementFloorModInt(int *input0, int *input1, int *output, int element_size); | |||||
| int BroadcastFloorMod(float *input0, float *input1, float *tile_input0, float *tile_input1, float *output, | int BroadcastFloorMod(float *input0, float *input1, float *tile_input0, float *tile_input1, float *output, | ||||
| int element_size, ArithmeticParameter *param); | int element_size, ArithmeticParameter *param); | ||||
| @@ -83,6 +83,7 @@ int ArithmeticCPUKernel::ReSize() { | |||||
| default: | default: | ||||
| arithmeticParameter_->broadcasting_ = false; | arithmeticParameter_->broadcasting_ = false; | ||||
| arithmetic_opt_run_ = ElementOptAdd; | arithmetic_opt_run_ = ElementOptAdd; | ||||
| arithmetic_opt_run_int_ = ElementOptAddInt; | |||||
| break; | break; | ||||
| } | } | ||||
| break; | break; | ||||
| @@ -299,6 +300,7 @@ kernel::LiteKernel *CpuArithmeticFp32KernelCreator(const std::vector<lite::Tenso | |||||
| REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Mul, CpuArithmeticFp32KernelCreator) | REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Mul, CpuArithmeticFp32KernelCreator) | ||||
| REG_KERNEL(kCPU, kNumberTypeInt, PrimitiveType_Mul, CpuArithmeticFp32KernelCreator) | REG_KERNEL(kCPU, kNumberTypeInt, PrimitiveType_Mul, CpuArithmeticFp32KernelCreator) | ||||
| REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Add, CpuArithmeticFp32KernelCreator) | REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Add, CpuArithmeticFp32KernelCreator) | ||||
| REG_KERNEL(kCPU, kNumberTypeInt, PrimitiveType_Add, CpuArithmeticFp32KernelCreator) | |||||
| REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Sub, CpuArithmeticFp32KernelCreator) | REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Sub, CpuArithmeticFp32KernelCreator) | ||||
| REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Div, CpuArithmeticFp32KernelCreator) | REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Div, CpuArithmeticFp32KernelCreator) | ||||
| REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_LogicalAnd, CpuArithmeticFp32KernelCreator) | REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_LogicalAnd, CpuArithmeticFp32KernelCreator) | ||||
| @@ -307,6 +309,8 @@ REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Maximum, CpuArithmeticFp32Ker | |||||
| REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Minimum, CpuArithmeticFp32KernelCreator) | REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Minimum, CpuArithmeticFp32KernelCreator) | ||||
| REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_FloorDiv, CpuArithmeticFp32KernelCreator) | REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_FloorDiv, CpuArithmeticFp32KernelCreator) | ||||
| REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_FloorMod, CpuArithmeticFp32KernelCreator) | REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_FloorMod, CpuArithmeticFp32KernelCreator) | ||||
| REG_KERNEL(kCPU, kNumberTypeInt, PrimitiveType_FloorDiv, CpuArithmeticFp32KernelCreator) | |||||
| REG_KERNEL(kCPU, kNumberTypeInt, PrimitiveType_FloorMod, CpuArithmeticFp32KernelCreator) | |||||
| REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_SquaredDifference, CpuArithmeticFp32KernelCreator) | REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_SquaredDifference, CpuArithmeticFp32KernelCreator) | ||||
| REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Equal, CpuArithmeticFp32KernelCreator) | REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Equal, CpuArithmeticFp32KernelCreator) | ||||
| REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_NotEqual, CpuArithmeticFp32KernelCreator) | REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_NotEqual, CpuArithmeticFp32KernelCreator) | ||||
| @@ -125,9 +125,11 @@ class ArithmeticCPUKernel : public LiteKernel { | |||||
| break; | break; | ||||
| case PrimitiveType_FloorDiv: | case PrimitiveType_FloorDiv: | ||||
| arithmetic_run_ = ElementFloorDiv; | arithmetic_run_ = ElementFloorDiv; | ||||
| arithmetic_run_int_ = ElementFloorDivInt; | |||||
| break; | break; | ||||
| case PrimitiveType_FloorMod: | case PrimitiveType_FloorMod: | ||||
| arithmetic_run_ = ElementFloorMod; | arithmetic_run_ = ElementFloorMod; | ||||
| arithmetic_run_int_ = ElementFloorModInt; | |||||
| break; | break; | ||||
| case PrimitiveType_Equal: | case PrimitiveType_Equal: | ||||
| arithmetic_run_ = ElementEqual; | arithmetic_run_ = ElementEqual; | ||||