Browse Source

[MSLITE][Develop] optimize arm cpu arithmetic: remove redundant code

tags/v1.1.0
yangruoqi713 5 years ago
parent
commit
43d0020564
3 changed files with 1061 additions and 2116 deletions
  1. +642
    -1228
      mindspore/lite/nnacl/fp16/arithmetic_fp16.c
  2. +417
    -886
      mindspore/lite/nnacl/fp32/arithmetic.c
  3. +2
    -2
      mindspore/lite/nnacl/fp32/lstm.c

+ 642
- 1228
mindspore/lite/nnacl/fp16/arithmetic_fp16.c
File diff suppressed because it is too large
View File


+ 417
- 886
mindspore/lite/nnacl/fp32/arithmetic.c
File diff suppressed because it is too large
View File


+ 2
- 2
mindspore/lite/nnacl/fp32/lstm.c View File

@@ -42,7 +42,7 @@ void MatMulAcc(float *output, const float *input, const float *weight, int rows,
int index = 0;
#ifdef ENABLE_ARM
float32x4_t out = vdupq_n_f32(0.0f);
- for (; index < inner_size - 4; index += 4) {
+ for (; index <= inner_size - 4; index += 4) {
float32x4_t in_0 = vld1q_f32(input_col + index);
float32x4_t in_1 = vld1q_f32(weight_col + index);
out = vmlaq_f32(out, in_1, in_0);
@@ -66,7 +66,7 @@ void MatMulAcc(float *output, const float *input, const float *weight, int rows,
void ElementMulAcc(const float *input0, const float *input1, float *output, int element_size) {
int index = 0;
#ifdef ENABLE_ARM
- for (; index < element_size - 4; index += 4) {
+ for (; index <= element_size - 4; index += 4) {
float32x4_t in_0 = vld1q_f32(input0 + index);
float32x4_t in_1 = vld1q_f32(input1 + index);
float32x4_t out = vld1q_f32(output + index);


Loading…
Cancel
Save