!13581 [ms][lite][cpu] master softmax exp erf fp16 32 算子优化

From: @lzkcode Reviewed-by: @zhang_xue_tong,@zhanghaibo5 Signed-off-by: @zhang_xue_tong
5 years ago · fbdd876396
--- a/mindspore/lite/nnacl/fp16/arithmetic_self_fp16.c
+++ b/mindspore/lite/nnacl/fp16/arithmetic_self_fp16.c
@@ -118,3 +118,10 @@ int ElementReciprocalFp16(float16_t *input, float16_t *output, int element_size)
  }
  return NNACL_OK;
 }

 // Element-wise error function: output[i] = erff(input[i]) for every i in [0, element_size).
 // Nothing is written when element_size is zero or negative. Always returns NNACL_OK.
 int ElementErfFp16(float16_t *input, float16_t *output, int element_size) {
  int idx = 0;
  while (idx < element_size) {
    output[idx] = erff(input[idx]);
    ++idx;
  }
  return NNACL_OK;
 }
--- a/mindspore/lite/nnacl/fp16/arithmetic_self_fp16.h
+++ b/mindspore/lite/nnacl/fp16/arithmetic_self_fp16.h
@@ -50,6 +50,8 @@ int ElementCeilFp16(float16_t *input, float16_t *output, int number);
 int ElementNegativeFp16(float16_t *input, float16_t *output, int element_size);

 int ElementReciprocalFp16(float16_t *input, float16_t *output, int element_size);

 int ElementErfFp16(float16_t *input, float16_t *output, int element_size);
 #ifdef __cplusplus
 }
 #endif
--- a/mindspore/lite/nnacl/fp16/exp_fp16.c
+++ b/mindspore/lite/nnacl/fp16/exp_fp16.c
@@ -0,0 +1,33 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "nnacl/fp16/exp_fp16.h"
 #include <math.h>
 #include <string.h>
 #include "nnacl/errorcode.h"

 // Writes an approximation of exp(src[i]) to dst[i] for every i in [0, num).
 // On ENABLE_ARM64 builds, whole groups of C8NUM elements are processed by simd_exp_fp16; whatever is left
 // (or everything, on other builds) is processed one element at a time by single_exp_fp16.
 void ExpFp16(const float16_t *src, float16_t *dst, int num) {
  int pos = 0;
 #ifdef ENABLE_ARM64
  // Largest multiple of C8NUM that does not exceed num.
  const int vec_end = num - num % C8NUM;
  while (pos < vec_end) {
    simd_exp_fp16(vld1q_f16(src + pos), dst + pos);
    pos += C8NUM;
  }
 #endif
  while (pos < num) {
    single_exp_fp16(src[pos], dst + pos);
    ++pos;
  }
 }
--- a/mindspore/lite/nnacl/fp16/exp_fp16.h
+++ b/mindspore/lite/nnacl/fp16/exp_fp16.h
@@ -0,0 +1,70 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_LITE_NNACL_FP16_EXP_H_
 #define MINDSPORE_LITE_NNACL_FP16_EXP_H_

 #include <string.h>
 #include "nnacl/op_base.h"

 #ifdef __cplusplus
 extern "C" {
 #endif
 void ExpFp16(const float16_t *src, float16_t *dst, int num);

 #if defined(ENABLE_ARM64)
 // Per-lane approximation of e^x for four fp32 values.
 //
 // The input is split as x = integer * c + decimal with c = 0.693147f (approximately ln 2):
 //   - 2^integer is built directly as a float bit pattern by placing (integer + 127) into the exponent
 //     field, i.e. shifting it left by 23 bits;
 //   - e^decimal is evaluated with the degree-5 polynomial
 //     1 + d * (1 + d * (1/2 + d * (1/6 + d * (1/24 + d / 120)))).
 // The product of the two is returned.
 //
 // No range clamping is done here; the exponent-field construction is only meaningful while
 // (integer + 127) stays inside the valid exponent range, so callers are expected to clamp the input first
 // (simd_exp_fp16 clamps to [-88, 88]).
 static inline float32x4_t exp_fp32(float32x4_t input) {
  static const float32x4_t param[] = {{0.693147f, 0.693147f, 0.693147f, 0.693147f},
                                      {1.0f / 120, 1.0f / 120, 1.0f / 120, 1.0f / 120},
                                      {1.0f / 24, 1.0f / 24, 1.0f / 24, 1.0f / 24},
                                      {1.0f / 6, 1.0f / 6, 1.0f / 6, 1.0f / 6},
                                      {0.5f, 0.5f, 0.5f, 0.5f},
                                      {1.0f, 1.0f, 1.0f, 1.0f}};
  int32x4_t integer = vcvtq_s32_f32(input / param[0]);
  float32x4_t decimal = input - vcvtq_f32_s32(integer) * param[0];
  int32x4_t int_exp = vshlq_s32((integer + vmovq_n_s32(127)), vmovq_n_s32(23));
  float32x4_t decimal_exp =
    param[5] +
    decimal * (param[5] + decimal * (param[4] + decimal * (param[3] + decimal * (param[2] + decimal * param[1]))));
  // Reinterpret the integer lanes as float bit patterns in-register instead of loading them back through a
  // float pointer: same bits, no type-punned memory access.
  return decimal_exp * vreinterpretq_f32_s32(int_exp);
 }

 // Approximates exp() for eight fp16 lanes and stores the eight results at dst.
 // Each lane is first clamped to [-88, 88]; the low and high halves are then widened to fp32, passed through
 // exp_fp32, narrowed back to fp16 and recombined into a single vector store.
 static inline void simd_exp_fp16(float16x8_t input, float16_t *dst) {
  const float16x8_t upper = vdupq_n_f16(88.0f);
  const float16x8_t lower = vdupq_n_f16(-88.0f);
  const float16x8_t clamped = vmaxq_f16(lower, vminq_f16(input, upper));

  const float32x4_t exp_lo = exp_fp32(vcvt_f32_f16(vget_low_f16(clamped)));
  const float32x4_t exp_hi = exp_fp32(vcvt_high_f32_f16(clamped));
  vst1q_f16(dst, vcombine_f16(vcvt_f16_f32(exp_lo), vcvt_f16_f32(exp_hi)));
 }
 #endif

 // Scalar approximation of e^src, written to *dst as fp16.
 //
 // src is clamped to [-88, 88] and then split as src = integer * c + decimal with c = 0.693147f
 // (approximately ln 2). 2^integer is formed as a float bit pattern by shifting (integer + 127) into the
 // exponent field (bit 23 upwards), and e^decimal is evaluated with the degree-5 polynomial
 // 1 + d * (1 + d * (1/2 + d * (1/6 + d * (1/24 + d / 120)))). The product is narrowed to fp16.
 static inline void single_exp_fp16(float16_t src, float16_t *dst) {
  static const float param[] = {0.693147f, 1.0f / 120, 1.0f / 24, 1.0f / 6, 1.0f / 2, 1.0f};
  src = MSMAX(-88.0f, MSMIN(88.0f, src));
  int integer = (float)src / param[0];
  float decimal = (float)src - integer * param[0];
  int int_exp = (integer + 127) << 23;
  float decimal_exp =
    1.0f + decimal * (1.0f + decimal * (0.5f + decimal * (param[3] + decimal * (param[2] + decimal * param[1]))));
  // Copy the bit pattern instead of dereferencing an int object through a float pointer, which would
  // violate the strict-aliasing rule.
  float pow2 = 0.0f;
  memcpy(&pow2, &int_exp, sizeof(pow2));
  *dst = (float16_t)(pow2 * decimal_exp);
 }
 #ifdef __cplusplus
 }
 #endif

 #endif  // MINDSPORE_LITE_NNACL_FP16_EXP_H_
--- a/mindspore/lite/nnacl/fp16/softmax_fp16.c
+++ b/mindspore/lite/nnacl/fp16/softmax_fp16.c
@@ -1,5 +1,5 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 * Copyright 2020-2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -16,7 +16,79 @@

 #include "nnacl/fp16/softmax_fp16.h"
 #include <math.h>
 #include <float.h>
 #include "nnacl/fp16/exp_fp16.h"

 // Subtracts the per-row maximum from every element.
 //
 // src and dst are treated as `batch` consecutive rows of `channel` fp16 values. For each row the maximum is
 // located (vectorised in groups of C8NUM on ENABLE_ARM64, scalar otherwise) and dst[row][k] is set to
 // src[row][k] - max, so the largest value of every output row is 0.
 //
 // The running maximum is seeded with -FLT16_MAX in both the vector and the scalar path, so both builds
 // produce the same result for the same row.
 void SoftmaxNormFp16(const float16_t *src, float16_t *dst, int batch, int channel) {
  int cur_batch_offset = 0;
  for (int i = 0; i < batch; i++, cur_batch_offset += channel) {
    int j = 0;
 #ifdef ENABLE_ARM64
    float16x8_t max_8 = vdupq_n_f16(-FLT16_MAX);
    int count = (channel / C8NUM) * C8NUM;
    for (; j < count; j += C8NUM) {
      float16x8_t input_8 = vld1q_f16(src + cur_batch_offset + j);
      max_8 = vmaxq_f16(max_8, input_8);
    }
    // Horizontal reduction of the eight lane maxima.
    float16_t max = vmaxvq_f16(max_8);
 #else
    // -FLT_MAX does not fit in half precision; use the fp16 limit, matching the seed of the vector path.
    float16_t max = -FLT16_MAX;
 #endif
    // Scalar pass over the elements the vector loop did not cover (all of them without ENABLE_ARM64).
    for (; j < channel; j++) {
      float16_t input = src[cur_batch_offset + j];
      if (input > max) {
        max = input;
      }
    }
    int k = 0;
 #ifdef ENABLE_NEON
    int count2 = (channel / C8NUM) * C8NUM;
    // The broadcast maximum is loop-invariant, so build it once per row.
    const float16x8_t max_dup = vdupq_n_f16(max);
    for (; k < count2; k += C8NUM) {
      float16x8_t input_8 = vld1q_f16(src + cur_batch_offset + k);
      float16x8_t output_8 = vsubq_f16(input_8, max_dup);
      vst1q_f16(dst + cur_batch_offset + k, output_8);
    }
 #endif
    for (; k < channel; k++) {
      int offset = cur_batch_offset + k;
      dst[offset] = src[offset] - max;
    }
  }
 }

 // Divides every element of a row by the sum of that row.
 //
 // src and dst are treated as `batch` consecutive rows of `channel` fp16 values. With ENABLE_NEON, whole
 // groups of C8NUM elements are summed with vector adds and later scaled by the fp16 reciprocal of the sum;
 // the remaining elements (or all of them without ENABLE_NEON) are summed and divided one at a time.
 //
 // NOTE(review): the row sum is accumulated in fp16; for very long rows this may lose precision - confirm
 // the accuracy is acceptable for the channel sizes used by callers.
 void SumAndDivFp16(const float16_t *src, float16_t *dst, int batch, int channel) {
  int row_base = 0;
  for (int row = 0; row < batch; ++row, row_base += channel) {
    const float16_t *row_in = src + row_base;
    float16_t *row_out = dst + row_base;

    // Pass 1: accumulate the row sum.
    float16_t sum = 0;
    int pos = 0;
 #ifdef ENABLE_NEON
    const int count = (channel / C8NUM) * C8NUM;
    float16x8_t sum8 = vdupq_n_f16(0);
    while (pos < count) {
      sum8 = vaddq_f16(sum8, vld1q_f16(row_in + pos));
      pos += C8NUM;
    }
    // Fold the eight partial sums into a single scalar.
    sum = sum8[0] + sum8[1] + sum8[2] + sum8[3] + sum8[4] + sum8[5] + sum8[6] + sum8[7];
 #endif
    while (pos < channel) {
      sum += row_in[pos];
      ++pos;
    }

    // Pass 2: scale the row by the sum.
    pos = 0;
 #ifdef ENABLE_NEON
    const float16_t div = 1.0f / sum;
    while (pos < count) {
      vst1q_f16(row_out + pos, vmulq_n_f16(vld1q_f16(row_in + pos), div));
      pos += C8NUM;
    }
 #endif
    while (pos < channel) {
      row_out[pos] = row_in[pos] / sum;
      ++pos;
    }
  }
 }

 // Softmax over rows of `channel` contiguous fp16 values, for `batch` rows.
 // Runs three passes: subtract the row maximum (src -> dst), exponentiate dst in place, then divide each
 // row of dst by its own sum in place.
 void SoftmaxLastAxisFp16(const float16_t *src, float16_t *dst, int batch, int channel) {
  const int total = batch * channel;
  // Pass 1: dst = src - max(row).
  SoftmaxNormFp16(src, dst, batch, channel);
  // Pass 2: dst = exp(dst) over all elements.
  ExpFp16(dst, dst, total);
  // Pass 3: dst = dst / sum(row).
  SumAndDivFp16(dst, dst, batch, channel);
 }

 // output = exp(input) / reduce_sum(exp(input), axis)
 void SoftmaxFp16(const float16_t *input_ptr, float16_t *output_ptr, float16_t *sum_data, SoftmaxParameter *parameter) {
--- a/mindspore/lite/nnacl/fp16/softmax_fp16.h
+++ b/mindspore/lite/nnacl/fp16/softmax_fp16.h
@@ -1,5 +1,5 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 * Copyright 2020-2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -26,6 +26,7 @@
 extern "C" {
 #endif
 void SoftmaxFp16(const float16_t *input_ptr, float16_t *output_ptr, float16_t *sum_data, SoftmaxParameter *parameter);
 void SoftmaxLastAxisFp16(const float16_t *src, float16_t *dst, int batch, int channel);
 #ifdef __cplusplus
 }
 #endif
--- a/mindspore/lite/nnacl/fp32/exp_fp32.h
+++ b/mindspore/lite/nnacl/fp32/exp_fp32.h
@@ -53,11 +53,10 @@ static inline void simd_exp(MS_FLOAT32X4 input, float *dst) {
  MS_INT32X4 integer = MS_CVTQPS_EPI32(input / param[0]);
  MS_FLOAT32X4 decimal = input - MS_CVTQEPI32_PS(integer) * param[0];
  MS_INT32X4 int_exp = MS_SLLIQ_EPI32(MS_ADDQ_EPI32(integer, MS_MOVQ_EPI32(127)), 23);
  memcpy(dst, &int_exp, sizeof(int32_t) * 4);
  MS_FLOAT32X4 decimal_exp =
    param[5] +
    decimal * (param[5] + decimal * (param[4] + decimal * (param[3] + decimal * (param[2] + decimal * param[1]))));
  MS_STQ_F32(dst, decimal_exp * MS_LDQ_F32(dst));
  MS_STQ_F32(dst, decimal_exp * MS_LDQ_F32((float *)(&int_exp)));
 }
 #endif

@@ -76,11 +75,10 @@ static inline void simd_exp_avx(MS_FLOAT32X8 input, float *dst) {
  MS_INT32X8 integer = MS_CVT256PS_EPI32(input / param[0]);
  MS_FLOAT32X8 decimal = input - MS_CVT256EPI32_PS(integer) * param[0];
  MS_INT32X8 int_exp = MS_SLLI256_EPI32(MS_ADD256_EPI32(integer, MS_MOV256_EPI32(127)), 23);
  memcpy(dst, &int_exp, sizeof(int32_t) * 8);
  MS_FLOAT32X8 decimal_exp =
    param[5] +
    decimal * (param[5] + decimal * (param[4] + decimal * (param[3] + decimal * (param[2] + decimal * param[1]))));
  MS_ST256_F32(dst, decimal_exp * MS_LD256_F32(dst));
  MS_ST256_F32(dst, decimal_exp * MS_LD256_F32((float *)(&int_exp)));
 }
 #endif

@@ -90,10 +88,10 @@ static inline void single_exp(float src, float *dst) {
  int integer = src / param[0];
  float decimal = src - integer * param[0];
  int int_exp = (integer + 127) << 23;
  memcpy(dst, &int_exp, sizeof(float));
  const float decimal_exp =
  float decimal_exp =
    1.0f + decimal * (1.0f + decimal * (0.5f + decimal * (param[3] + decimal * (param[2] + decimal * param[1]))));
  *dst *= decimal_exp;
  float *ptr = (float *)&int_exp;
  *dst = *ptr * decimal_exp;
 }
 #ifdef __cplusplus
 }
--- a/mindspore/lite/nnacl/fp32/softmax_fp32.c
+++ b/mindspore/lite/nnacl/fp32/softmax_fp32.c
@@ -22,14 +22,21 @@ void SoftmaxNorm(const float *src, float *dst, int batch, int channel) {
  int cur_batch_offset = 0;
  for (int i = 0; i < batch; i++, cur_batch_offset += channel) {
    int j = 0;
 #ifdef ENABLE_ARM64
 #ifdef ENABLE_NEON
    float32x4_t max4 = vdupq_n_f32(-FLT_MAX);
    int count = (channel / C4NUM) * C4NUM;
    for (; j < count; j += C4NUM) {
      float32x4_t input4 = vld1q_f32(src + cur_batch_offset + j);
      max4 = vmaxq_f32(max4, input4);
    }
 #ifdef ENABLE_ARM64
    float max = vmaxvq_f32(max4);
 #else
    float max = max4[0];
    for (int m = 1; m < 4; ++m) {
      max = MSMAX(max, max4[m]);
    }
 #endif
 #else
    float max = -FLT_MAX;
 #endif
@@ -66,7 +73,11 @@ void SumAndDiv(const float *src, float *dst, int batch, int channel) {
    for (; j < count; j += C4NUM) {
      sum4 = vaddq_f32(sum4, vld1q_f32(src + cur_batch_offset + j));
    }
 #ifdef ENABLE_ARM64
    sum = vaddvq_f32(sum4);
 #else
    sum = sum4[0] + sum4[1] + sum4[2] + sum4[3];
 #endif
 #endif
    for (; j < channel; j++) {
      sum += src[cur_batch_offset + j];
--- a/mindspore/lite/nnacl/op_base.h
+++ b/mindspore/lite/nnacl/op_base.h
@@ -61,6 +61,7 @@
 #define kInputSize2 3
 #define MAX_AXIS_SIZE 6
 #define MAX_LEN 256
 #define FLT16_MAX 65504

 typedef enum LiteDataType {
  kDataTypeFloat,
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/arithmetic_self_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/arithmetic_self_fp16.cc
@@ -43,7 +43,8 @@ ArithmeticSelfFp16Func ArithmeticSelfFp16CPUKernel::GetArithmeticSelfFp16Fun(int
                                      {mindspore::schema::PrimitiveType_Ceil, ElementCeilFp16},
                                      {mindspore::schema::PrimitiveType_Round, ElementRoundFp16},
                                      {mindspore::schema::PrimitiveType_Neg, ElementNegativeFp16},
                                      {mindspore::schema::PrimitiveType_Reciprocal, ElementReciprocalFp16}};
                                      {mindspore::schema::PrimitiveType_Reciprocal, ElementReciprocalFp16},
                                      {mindspore::schema::PrimitiveType_Erf, ElementErfFp16}};
  for (size_t i = 0; i < sizeof(type_func_table) / sizeof(TYPE_FUNC_INFO); i++) {
    if (type_func_table[i].primitive_type_ == primitive_type) {
      return type_func_table[i].func_;
@@ -98,4 +99,5 @@ REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Ceil, LiteKernelCreator<Arith
 REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Round, LiteKernelCreator<ArithmeticSelfFp16CPUKernel>)
 REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Neg, LiteKernelCreator<ArithmeticSelfFp16CPUKernel>)
 REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Reciprocal, LiteKernelCreator<ArithmeticSelfFp16CPUKernel>)
 REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Erf, LiteKernelCreator<ArithmeticSelfFp16CPUKernel>)
 }  // namespace mindspore::kernel
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/softmax_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/softmax_fp16.cc
@@ -43,59 +43,74 @@ int SoftmaxFp16CPUKernel::Init() {
  return ReSize();
 }

 int SoftmaxFp16CPUKernel::ReSize() { return SoftmaxBaseCPUKernel::ReSize(); }

 int SoftmaxFp16CPUKernel::MallocTmpBuffer() {
 int SoftmaxFp16CPUKernel::ReSize() {
  auto ret = SoftmaxBaseCPUKernel::ReSize();
  if (ret != RET_OK) {
    return ret;
  }
  auto n_dim = softmax_param_->n_dim_;
  auto axis = softmax_param_->axis_;
  if (axis == -1) {
    softmax_param_->axis_ += n_dim;
    axis = softmax_param_->axis_;
  }
  auto in_shape = in_tensors_.front()->shape();
  int out_plane_size = 1;
  out_plane_size_ = 1;
  for (int i = 0; i < axis; ++i) {
    out_plane_size *= in_shape[i];
    out_plane_size_ *= in_shape[i];
  }
  int in_plane_size = 1;
  in_plane_size_ = 1;
  for (int i = axis + 1; i < n_dim; i++) {
    in_plane_size *= in_shape[i];
    in_plane_size_ *= in_shape[i];
  }

  sum_data_ =
    reinterpret_cast<float16_t *>(context_->allocator->Malloc(out_plane_size * in_plane_size * sizeof(float16_t)));
  if (sum_data_ != nullptr) {
    free(sum_data_);
  }
  sum_data_ = reinterpret_cast<float16_t *>(malloc(out_plane_size_ * in_plane_size_ * sizeof(float16_t)));
  if (sum_data_ == nullptr) {
    MS_LOG(ERROR) << "malloc data for softmax fail!";
    return RET_ERROR;
  }
  memset(sum_data_, 0, out_plane_size * in_plane_size * sizeof(float16_t));
  return RET_OK;
 }

 void SoftmaxFp16CPUKernel::FreeTmpBuffer() {
  if (sum_data_ != nullptr) {
    context_->allocator->Free(sum_data_);
    sum_data_ = nullptr;
  }
 // Runs the last-axis softmax for the slice of outer rows assigned to `task_id`.
 //
 // The out_plane_size_ rows are divided into chunks of UP_DIV(out_plane_size_, thread_num_) rows; task
 // `task_id` handles rows [begin, end). Each row holds `channel` values, where channel is the input extent
 // along the softmax axis. Returns RET_OK.
 int SoftmaxFp16CPUKernel::DoSoftmaxLastAxis(int task_id) {
  int unit = UP_DIV(out_plane_size_, context_->thread_num_);
  int begin = task_id * unit;
  int end = MSMIN(begin + unit, out_plane_size_);
  if (begin >= end) {
    // When out_plane_size_ is not a multiple of unit, trailing tasks have no rows; return before any pointer
    // beyond the tensor data is formed or a negative row count is passed on.
    return RET_OK;
  }
  int channel = softmax_param_->input_shape_[softmax_param_->axis_];
  int offset = begin * channel;
  auto input_ptr = reinterpret_cast<float16_t *>(in_tensors_.at(kInputIndex)->MutableData());
  auto output_ptr = reinterpret_cast<float16_t *>(out_tensors_.at(kOutputIndex)->MutableData());
  SoftmaxLastAxisFp16(input_ptr + offset, output_ptr + offset, end - begin, channel);
  return RET_OK;
 }

 int SoftmaxFp16CPUKernel::Run() {
  auto ret = MallocTmpBuffer();
 int SoftmaxLastAxisFp16Run(void *cdata, int task_id) {
  auto kernel = reinterpret_cast<SoftmaxFp16CPUKernel *>(cdata);
  auto ret = kernel->DoSoftmaxLastAxis(task_id);
  if (ret != RET_OK) {
    FreeTmpBuffer();
    MS_LOG(ERROR) << "MallocTmpBuffer failed";
    return RET_ERROR;
    MS_LOG(ERROR) << "DoSoftmaxLastAxisFp16 error task_id: " << task_id << ", ret: " << ret;
  }
  return ret;
 }

 int SoftmaxFp16CPUKernel::Run() {
  auto input_tensor = in_tensors_.at(0);
  MS_ASSERT(input_tensor);
  auto output_tensor = out_tensors_.at(0);

  MS_ASSERT(output_tensor);
  input_fp16_ = reinterpret_cast<float16_t *>(input_tensor->data_c());
  MS_ASSERT(input_fp16_);
  output_fp16_ = reinterpret_cast<float16_t *>(output_tensor->data_c());

  SoftmaxFp16(input_fp16_, output_fp16_, sum_data_, softmax_param_);

  FreeTmpBuffer();
  MS_ASSERT(output_fp16_);
  if (in_plane_size_ == 1) {
    auto ret = ParallelLaunch(this->context_->thread_pool_, SoftmaxLastAxisFp16Run, this, context_->thread_num_);
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "SoftmaxFp16CPUKernel ParallelLaunch failed, ret: " << ret;
    }
    return ret;
  } else {
    MS_ASSERT(sum_data_);
    memset(sum_data_, 0, out_plane_size_ * in_plane_size_ * sizeof(float16_t));
    SoftmaxFp16(input_fp16_, output_fp16_, sum_data_, softmax_param_);
  }
  return RET_OK;
 }

--- a/mindspore/lite/src/runtime/kernel/arm/fp16/softmax_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/softmax_fp16.h
@@ -28,18 +28,25 @@ class SoftmaxFp16CPUKernel : public SoftmaxBaseCPUKernel {
  SoftmaxFp16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
                       const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
      : SoftmaxBaseCPUKernel(parameter, inputs, outputs, ctx), sum_data_(nullptr) {}
  ~SoftmaxFp16CPUKernel() = default;
  ~SoftmaxFp16CPUKernel() override {
    if (sum_data_ != nullptr) {
      free(sum_data_);
    }
  }

  int Init() override;
  int ReSize() override;
  int Run() override;
  int MallocTmpBuffer();
  void FreeTmpBuffer();
  int DoSoftmaxLastAxis(int task_id);

 private:
  float16_t *sum_data_ = nullptr;
  float16_t *input_fp16_ = nullptr;
  float16_t *output_fp16_ = nullptr;
  int in_plane_size_ = 0;
  int out_plane_size_ = 0;
 };
 }  // namespace mindspore::kernel

--- a/mindspore/lite/test/models_tf.cfg
+++ b/mindspore/lite/test/models_tf.cfg
@@ -62,3 +62,5 @@ ml_video_edit_oneclick_adaptis.pb 3
 # Q_hand_0812.pb is not suitable for float16. Out of float16 range.
 Q_hand_0812.pb
 tacotron_encoder_stf.pb 5;1:1,62:1,62:1,62:1,62
 Q_inception-249970-672-11-16.pb 1
 Q_dila-small-mix-full-fineturn-390000-nopixel-nosigmoid.pb 1