From: @jiangzhiwen8 Reviewed-by: @xulei2020 Signed-off-by:tags/v1.1.0
| @@ -19,8 +19,6 @@ | |||
| #include <string.h> | |||
| #include <cmath> | |||
| #include <vector> | |||
| #include <algorithm> | |||
| #include <limits> | |||
| namespace mindspore { | |||
| namespace dataset { | |||
| @@ -549,73 +547,39 @@ template <typename T> | |||
| static void PadWithConstant(const LiteMat &src, LiteMat &dst, const int top, const int bottom, const int left, | |||
| const int right, const PaddBorderType pad_type, uint8_t fill_b_or_gray, uint8_t fill_g, | |||
| uint8_t fill_r) { | |||
| dst.Init(src.width_ + left + right, src.height_ + top + bottom, src.channel_, src.data_type_); | |||
| const T *src_start_p = src; | |||
| T *dst_start_p = dst; | |||
| // padd top | |||
| for (int h = 0; h < top; h++) { | |||
| for (int w = 0; w < dst.width_; w++) { | |||
| uint32_t index = (h * dst.width_ + w) * dst.channel_; | |||
| if (dst.channel_ == 1) { | |||
| dst_start_p[index] = fill_b_or_gray; | |||
| } else if (dst.channel_ == 3) { | |||
| dst_start_p[index] = fill_b_or_gray; | |||
| dst_start_p[index + 1] = fill_g; | |||
| dst_start_p[index + 2] = fill_r; | |||
| } else { | |||
| } | |||
| std::vector<uint8_t> row_buffer(dst.width_ * dst.channel_); | |||
| uint8_t *const_ptr = row_buffer.data(); | |||
| int src_step = src.width_ * dst.channel_; | |||
| int dst_step = dst.width_ * dst.channel_; | |||
| if (dst.channel_ == 1) { | |||
| for (int i = 0; i < dst_step; i++) { | |||
| const_ptr[i] = fill_b_or_gray; | |||
| } | |||
| } | |||
| // padd bottom | |||
| for (int h = dst.height_ - bottom; h < dst.height_; h++) { | |||
| for (int w = 0; w < dst.width_; w++) { | |||
| uint32_t index = (h * dst.width_ + w) * dst.channel_; | |||
| if (dst.channel_ == 1) { | |||
| dst_start_p[index] = fill_b_or_gray; | |||
| } else if (dst.channel_ == 3) { | |||
| dst_start_p[index] = fill_b_or_gray; | |||
| dst_start_p[index + 1] = fill_g; | |||
| dst_start_p[index + 2] = fill_r; | |||
| } else { | |||
| } | |||
| } else if (dst.channel_ == 3) { | |||
| for (int i = 0; i < dst.width_; i++) { | |||
| const_ptr[i * dst.channel_] = fill_b_or_gray; | |||
| const_ptr[i * dst.channel_ + 1] = fill_g; | |||
| const_ptr[i * dst.channel_ + 2] = fill_r; | |||
| } | |||
| } | |||
| // padd left | |||
| for (int h = top; h < dst.height_ - bottom; h++) { | |||
| for (int w = 0; w < left; w++) { | |||
| uint32_t index = (h * dst.width_ + w) * dst.channel_; | |||
| if (dst.channel_ == 1) { | |||
| dst_start_p[index] = fill_b_or_gray; | |||
| } else if (dst.channel_ == 3) { | |||
| dst_start_p[index] = fill_b_or_gray; | |||
| dst_start_p[index + 1] = fill_g; | |||
| dst_start_p[index + 2] = fill_r; | |||
| } else { | |||
| } | |||
| } | |||
| uint8_t *dst_ptr = reinterpret_cast<uint8_t *>(dst.data_ptr_); | |||
| uint8_t *src_ptr = reinterpret_cast<uint8_t *>(src.data_ptr_); | |||
| for (int i = 0; i < top; i++) { | |||
| memcpy(dst_ptr + i * dst_step, const_ptr, dst_step); | |||
| } | |||
| // padd right | |||
| for (int h = top; h < dst.height_ - bottom; h++) { | |||
| for (int w = dst.width_ - right; w < dst.width_; w++) { | |||
| uint32_t index = (h * dst.width_ + w) * dst.channel_; | |||
| if (dst.channel_ == 1) { | |||
| dst_start_p[index] = fill_b_or_gray; | |||
| } else if (dst.channel_ == 3) { | |||
| dst_start_p[index] = fill_b_or_gray; | |||
| dst_start_p[index + 1] = fill_g; | |||
| dst_start_p[index + 2] = fill_r; | |||
| } else { | |||
| } | |||
| } | |||
| int left_size = left * dst.channel_; | |||
| int right_size = right * dst.channel_; | |||
| uint8_t *dst_raw_data = dst_ptr + top * dst_step + left_size; | |||
| for (int i = 0; i < src.width_; i++, dst_raw_data += dst_step, src_ptr += src_step) { | |||
| memcpy(dst_raw_data, src_ptr, src_step); | |||
| memcpy(dst_raw_data - left_size, const_ptr, left_size); | |||
| memcpy(dst_raw_data + src_step, const_ptr, right_size); | |||
| } | |||
| // image data | |||
| dst_start_p = dst_start_p + (top * dst.width_ + left) * dst.channel_; | |||
| for (int i_h = 0; i_h < src.height_; i_h++) { | |||
| const T *src_index_p = src_start_p + i_h * src.width_ * src.channel_; | |||
| T *dst_index_p = dst_start_p + i_h * dst.width_ * dst.channel_; | |||
| (void)memcpy(dst_index_p, src_index_p, src.width_ * src.channel_ * sizeof(T)); | |||
| for (int i = dst.height_ - bottom; i < dst.height_; i++) { | |||
| memcpy(dst_ptr + i * dst_step, const_ptr, dst_step); | |||
| } | |||
| } | |||
| @@ -758,6 +722,15 @@ bool Pad(const LiteMat &src, LiteMat &dst, int top, int bottom, int left, int ri | |||
| if (src.IsEmpty()) { | |||
| return false; | |||
| } | |||
| int dst_width = src.width_ + left + right; | |||
| int dst_height = src.height_ + top + bottom; | |||
| if (dst.IsEmpty()) { | |||
| dst.Init(dst_width, dst_height, src.channel_, src.data_type_); | |||
| } else if (dst.width_ != dst_width || dst.height_ != dst_height || src.channel_ != dst.channel_) { | |||
| return false; | |||
| } else if (src.data_type_ != dst.data_type_) { | |||
| return false; | |||
| } | |||
| if (pad_type == PADD_BORDER_CONSTANT && src.data_type_ == LDataType::FLOAT32) { | |||
| PadWithConstant<float>(src, dst, top, bottom, left, right, pad_type, fill_b_or_gray, fill_g, fill_r); | |||
| } else if (pad_type == PADD_BORDER_CONSTANT && src.data_type_ == LDataType::UINT8) { | |||
| @@ -921,167 +894,5 @@ bool Affine(LiteMat &src, LiteMat &out_img, const double M[6], std::vector<size_ | |||
| return ImplementAffine(src, out_img, M, dsize, borderValue); | |||
| } | |||
// Element-wise difference, generic case: T's own arithmetic semantics apply
// (signed/float types may go negative; no clamping is performed here).
template <typename T>
inline void SubtractImpl(const T *src1_ptr, const T *src2_ptr, T *dst, size_t total_size) {
  std::transform(src1_ptr, src1_ptr + total_size, src2_ptr, dst,
                 [](T a, T b) { return static_cast<T>(a - b); });
}

// uint8 specialization: saturate the difference into [0, 255].
template <>
inline void SubtractImpl(const uint8_t *src1_ptr, const uint8_t *src2_ptr, uint8_t *dst, size_t total_size) {
  for (size_t i = 0; i < total_size; i++) {
    const int diff = static_cast<int>(src1_ptr[i]) - static_cast<int>(src2_ptr[i]);
    dst[i] = static_cast<uint8_t>(std::max(0, std::min(255, diff)));
  }
}

// uint16 specialization: saturate the difference into [0, 65535].
template <>
inline void SubtractImpl(const uint16_t *src1_ptr, const uint16_t *src2_ptr, uint16_t *dst, size_t total_size) {
  for (size_t i = 0; i < total_size; i++) {
    const int diff = static_cast<int>(src1_ptr[i]) - static_cast<int>(src2_ptr[i]);
    dst[i] = static_cast<uint16_t>(std::max(0, std::min(65535, diff)));
  }
}

// uint32 specialization: compute in 64-bit, saturate into [0, 4294967295].
template <>
inline void SubtractImpl(const uint32_t *src1_ptr, const uint32_t *src2_ptr, uint32_t *dst, size_t total_size) {
  for (size_t i = 0; i < total_size; i++) {
    const int64_t diff = static_cast<int64_t>(src1_ptr[i]) - static_cast<int64_t>(src2_ptr[i]);
    dst[i] = static_cast<uint32_t>(std::max<int64_t>(0, std::min<int64_t>(4294967295LL, diff)));
  }
}
| bool Subtract(const LiteMat &src1, const LiteMat &src2, LiteMat &dst) { | |||
| if (src1.width_ != src2.width_ || src1.height_ != src2.height_ || src1.channel_ != src2.channel_) { | |||
| return false; | |||
| } | |||
| if (src1.data_type_ != src2.data_type_) { | |||
| return false; | |||
| } | |||
| if (dst.IsEmpty()) { | |||
| dst.Init(src1.width_, src1.height_, src1.channel_, src1.data_type_); | |||
| } else if (src1.width_ != dst.width_ || src1.height_ != dst.height_ || src1.channel_ != dst.channel_) { | |||
| return false; | |||
| } else if (src1.data_type_ != dst.data_type_) { | |||
| return false; | |||
| } | |||
| size_t total_size = src1.height_ * src1.width_ * src1.channel_; | |||
| if (src1.data_type_ == LDataType::BOOL) { | |||
| SubtractImpl<bool>(src1, src2, dst, total_size); | |||
| } else if (src1.data_type_ == LDataType::INT8) { | |||
| SubtractImpl<int8_t>(src1, src2, dst, total_size); | |||
| } else if (src1.data_type_ == LDataType::UINT8) { | |||
| SubtractImpl<uint8_t>(src1, src2, dst, total_size); | |||
| } else if (src1.data_type_ == LDataType::INT16) { | |||
| SubtractImpl<int16_t>(src1, src2, dst, total_size); | |||
| } else if (src1.data_type_ == LDataType::UINT16) { | |||
| SubtractImpl<uint16_t>(src1, src2, dst, total_size); | |||
| } else if (src1.data_type_ == LDataType::INT32) { | |||
| SubtractImpl<int32_t>(src1, src2, dst, total_size); | |||
| } else if (src1.data_type_ == LDataType::UINT32) { | |||
| SubtractImpl<uint32_t>(src1, src2, dst, total_size); | |||
| } else if (src1.data_type_ == LDataType::INT64) { | |||
| SubtractImpl<int64_t>(src1, src2, dst, total_size); | |||
| } else if (src1.data_type_ == LDataType::UINT64) { | |||
| SubtractImpl<uint64_t>(src1, src2, dst, total_size); | |||
| } else if (src1.data_type_ == LDataType::FLOAT32) { | |||
| SubtractImpl<float>(src1, src2, dst, total_size); | |||
| } else if (src1.data_type_ == LDataType::FLOAT64) { | |||
| SubtractImpl<double>(src1, src2, dst, total_size); | |||
| } else { | |||
| return false; | |||
| } | |||
| return true; | |||
| } | |||
// Element-wise division, generic case. A zero divisor yields 0.
//
// The previous code biased the divisor with std::numeric_limits<float>::min()
// instead of guarding zero. That (a) produced inf (then UB on conversion back
// to integer T) when the divisor was 0, and (b) forced every integer division
// through float, losing precision for wide types such as int64_t/uint32_t.
template <typename T>
inline void DivideImpl(const T *src1_ptr, const T *src2_ptr, T *dst, size_t total_size) {
  for (size_t i = 0; i < total_size; i++) {
    dst[i] = src2_ptr[i] ? src1_ptr[i] / src2_ptr[i] : T{};
  }
}

// uint8 specialization: round-to-nearest quotient, clamped to [0, 255];
// division by zero yields 0.
template <>
inline void DivideImpl(const uint8_t *src1_ptr, const uint8_t *src2_ptr, uint8_t *dst, size_t total_size) {
  for (size_t i = 0; i < total_size; i++) {
    int val = src2_ptr[i] ? static_cast<int>(std::round(static_cast<float>(src1_ptr[i]) / src2_ptr[i])) : 0;
    dst[i] =
      std::max<int>(std::numeric_limits<uint8_t>::min(), std::min<int>(std::numeric_limits<uint8_t>::max(), val));
  }
}

// uint16 specialization: round-to-nearest quotient, clamped to [0, 65535];
// division by zero yields 0.
template <>
inline void DivideImpl(const uint16_t *src1_ptr, const uint16_t *src2_ptr, uint16_t *dst, size_t total_size) {
  for (size_t i = 0; i < total_size; i++) {
    int val = src2_ptr[i] ? static_cast<int>(std::round(static_cast<float>(src1_ptr[i]) / src2_ptr[i])) : 0;
    dst[i] =
      std::max<int>(std::numeric_limits<uint16_t>::min(), std::min<int>(std::numeric_limits<uint16_t>::max(), val));
  }
}

// uint32 specialization: divide in double (exact for 32-bit operands),
// round to nearest, clamp to [0, 4294967295]; division by zero yields 0.
template <>
inline void DivideImpl(const uint32_t *src1_ptr, const uint32_t *src2_ptr, uint32_t *dst, size_t total_size) {
  for (size_t i = 0; i < total_size; i++) {
    int64_t val = src2_ptr[i] ? static_cast<int64_t>(std::round(static_cast<double>(src1_ptr[i]) / src2_ptr[i])) : 0;
    dst[i] = std::max<int64_t>(std::numeric_limits<uint32_t>::min(),
                               std::min<int64_t>(std::numeric_limits<uint32_t>::max(), val));
  }
}
| bool Divide(const LiteMat &src1, const LiteMat &src2, LiteMat &dst) { | |||
| if (src1.width_ != src2.width_ || src1.height_ != src2.height_ || src1.channel_ != src2.channel_) { | |||
| return false; | |||
| } | |||
| if (src1.data_type_ != src2.data_type_) { | |||
| return false; | |||
| } | |||
| if (dst.IsEmpty()) { | |||
| dst.Init(src1.width_, src1.height_, src1.channel_, src1.data_type_); | |||
| } else if (src1.width_ != dst.width_ || src1.height_ != dst.height_ || src1.channel_ != dst.channel_) { | |||
| return false; | |||
| } else if (src1.data_type_ != dst.data_type_) { | |||
| return false; | |||
| } | |||
| size_t total_size = src1.height_ * src1.width_ * src1.channel_; | |||
| if (src1.data_type_ == LDataType::INT8) { | |||
| DivideImpl<int8_t>(src1, src2, dst, total_size); | |||
| } else if (src1.data_type_ == LDataType::UINT8) { | |||
| DivideImpl<uint8_t>(src1, src2, dst, total_size); | |||
| } else if (src1.data_type_ == LDataType::INT16) { | |||
| DivideImpl<int16_t>(src1, src2, dst, total_size); | |||
| } else if (src1.data_type_ == LDataType::UINT16) { | |||
| DivideImpl<uint16_t>(src1, src2, dst, total_size); | |||
| } else if (src1.data_type_ == LDataType::INT32) { | |||
| DivideImpl<int32_t>(src1, src2, dst, total_size); | |||
| } else if (src1.data_type_ == LDataType::UINT32) { | |||
| DivideImpl<uint32_t>(src1, src2, dst, total_size); | |||
| } else if (src1.data_type_ == LDataType::INT64) { | |||
| DivideImpl<int64_t>(src1, src2, dst, total_size); | |||
| } else if (src1.data_type_ == LDataType::UINT64) { | |||
| DivideImpl<uint64_t>(src1, src2, dst, total_size); | |||
| } else if (src1.data_type_ == LDataType::FLOAT32) { | |||
| DivideImpl<float>(src1, src2, dst, total_size); | |||
| } else if (src1.data_type_ == LDataType::FLOAT64) { | |||
| DivideImpl<double>(src1, src2, dst, total_size); | |||
| } else { | |||
| return false; | |||
| } | |||
| return true; | |||
| } | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| @@ -107,12 +107,6 @@ void ConvertBoxes(std::vector<std::vector<float>> &boxes, const std::vector<std: | |||
| std::vector<int> ApplyNms(const std::vector<std::vector<float>> &all_boxes, std::vector<float> &all_scores, float thres, | |||
| int max_boxes); | |||
| /// \brief Calculates the difference between the two images for each element | |||
| bool Subtract(const LiteMat &src1, const LiteMat &src2, LiteMat &dst); | |||
| /// \brief Calculates the division between the two images for each element | |||
| bool Divide(const LiteMat &src1, const LiteMat &src2, LiteMat &dst); | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| #endif // IMAGE_PROCESS_H_ | |||
| @@ -15,6 +15,16 @@ | |||
| */ | |||
| #include "minddata/dataset/kernels/image/lite_cv/lite_mat.h" | |||
| #include <algorithm> | |||
| #include <limits> | |||
| #include <cmath> | |||
| #ifdef ENABLE_ANDROID | |||
| #if defined(__arm__) || defined(__aarch64__) || defined(_M_ARM) || defined(_M_ARM64) | |||
| #define USE_NEON | |||
| #include <arm_neon.h> | |||
| #endif | |||
| #endif | |||
| namespace mindspore { | |||
| namespace dataset { | |||
| @@ -221,5 +231,258 @@ void LiteMat::AlignFree(void *ptr) { | |||
| inline void LiteMat::InitElemSize(LDataType data_type) { elem_size_ = data_type.SizeInBytes(); } | |||
// Element-wise difference, generic case: T's own arithmetic semantics apply
// (no clamping for signed/float types).
template <typename T>
inline void SubtractImpl(const T *src0, const T *src1, T *dst, int64_t total_size) {
  for (int64_t idx = 0; idx < total_size; ++idx) {
    dst[idx] = src0[idx] - src1[idx];
  }
}

// uint8 specialization: saturating subtraction, differences below 0 clamp to 0.
template <>
inline void SubtractImpl(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, int64_t total_size) {
  int64_t x = 0;
#ifdef USE_NEON
  // Two 16-byte saturating vector subtracts per iteration (32 bytes/step).
  const int64_t step = 32;
  for (; x + step <= total_size; x += step) {
    uint8x16_t a_lo = vld1q_u8(src0 + x);
    uint8x16_t a_hi = vld1q_u8(src0 + x + 16);
    uint8x16_t b_lo = vld1q_u8(src1 + x);
    uint8x16_t b_hi = vld1q_u8(src1 + x + 16);
    vst1q_u8(dst + x, vqsubq_u8(a_lo, b_lo));
    vst1q_u8(dst + x + 16, vqsubq_u8(a_hi, b_hi));
  }
#endif
  // Scalar tail (and the full range when NEON is unavailable).
  for (; x < total_size; ++x) {
    const int32_t diff = static_cast<int32_t>(src0[x]) - static_cast<int32_t>(src1[x]);
    dst[x] = static_cast<uint8_t>(std::max<int32_t>(0, std::min<int32_t>(255, diff)));
  }
}

// uint16 specialization: saturate into [0, 65535].
template <>
inline void SubtractImpl(const uint16_t *src0, const uint16_t *src1, uint16_t *dst, int64_t total_size) {
  for (int64_t idx = 0; idx < total_size; ++idx) {
    const int32_t diff = static_cast<int32_t>(src0[idx]) - static_cast<int32_t>(src1[idx]);
    dst[idx] = static_cast<uint16_t>(std::max<int32_t>(0, std::min<int32_t>(65535, diff)));
  }
}

// uint32 specialization: compute in 64-bit, saturate into [0, 4294967295].
template <>
inline void SubtractImpl(const uint32_t *src0, const uint32_t *src1, uint32_t *dst, int64_t total_size) {
  for (int64_t idx = 0; idx < total_size; ++idx) {
    const int64_t diff = static_cast<int64_t>(src0[idx]) - static_cast<int64_t>(src1[idx]);
    dst[idx] = static_cast<uint32_t>(std::max<int64_t>(0, std::min<int64_t>(4294967295LL, diff)));
  }
}
| bool Subtract(const LiteMat &src_a, const LiteMat &src_b, LiteMat *dst) { | |||
| if (src_a.width_ != src_b.width_ || src_a.height_ != src_b.height_ || src_a.channel_ != src_b.channel_) { | |||
| return false; | |||
| } | |||
| if (src_a.data_type_ != src_b.data_type_) { | |||
| return false; | |||
| } | |||
| if (dst->IsEmpty()) { | |||
| dst->Init(src_a.width_, src_a.height_, src_a.channel_, src_a.data_type_); | |||
| } else if (src_a.width_ != dst->width_ || src_a.height_ != dst->height_ || src_a.channel_ != dst->channel_) { | |||
| return false; | |||
| } else if (src_a.data_type_ != dst->data_type_) { | |||
| return false; | |||
| } | |||
| int64_t total_size = src_a.height_ * src_a.width_ * src_a.channel_; | |||
| if (src_a.data_type_ == LDataType::BOOL) { | |||
| SubtractImpl<bool>(src_a, src_b, *dst, total_size); | |||
| } else if (src_a.data_type_ == LDataType::INT8) { | |||
| SubtractImpl<int8_t>(src_a, src_b, *dst, total_size); | |||
| } else if (src_a.data_type_ == LDataType::UINT8) { | |||
| SubtractImpl<uint8_t>(src_a, src_b, *dst, total_size); | |||
| } else if (src_a.data_type_ == LDataType::INT16) { | |||
| SubtractImpl<int16_t>(src_a, src_b, *dst, total_size); | |||
| } else if (src_a.data_type_ == LDataType::UINT16) { | |||
| SubtractImpl<uint16_t>(src_a, src_b, *dst, total_size); | |||
| } else if (src_a.data_type_ == LDataType::INT32) { | |||
| SubtractImpl<int32_t>(src_a, src_b, *dst, total_size); | |||
| } else if (src_a.data_type_ == LDataType::UINT32) { | |||
| SubtractImpl<uint32_t>(src_a, src_b, *dst, total_size); | |||
| } else if (src_a.data_type_ == LDataType::INT64) { | |||
| SubtractImpl<int64_t>(src_a, src_b, *dst, total_size); | |||
| } else if (src_a.data_type_ == LDataType::UINT64) { | |||
| SubtractImpl<uint64_t>(src_a, src_b, *dst, total_size); | |||
| } else if (src_a.data_type_ == LDataType::FLOAT32) { | |||
| SubtractImpl<float>(src_a, src_b, *dst, total_size); | |||
| } else if (src_a.data_type_ == LDataType::FLOAT64) { | |||
| SubtractImpl<double>(src_a, src_b, *dst, total_size); | |||
| } else { | |||
| return false; | |||
| } | |||
| return true; | |||
| } | |||
#ifdef USE_NEON
// Approximate 1/val: initial vrecpeq_f32 estimate refined by two
// Newton-Raphson steps (each vrecpsq_f32 computes 2 - val * estimate).
inline float32x4_t reciprocal_simd(float32x4_t val) {
  // get an initial estimate of 1/val
  float32x4_t reciprocal = vrecpeq_f32(val);
  // use Newton-Raphson steps to refine the estimate
  reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal);
  reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal);
  return reciprocal;
}

// Round half away from zero: add +/-0.5 carrying v's sign bit; the caller is
// expected to truncate afterwards via a float->integer conversion.
inline float32x4_t round_simd(const float32x4_t &v) {
  // Build the 0x80000000 sign-bit mask via an unsigned splat + reinterpret;
  // vdupq_n_s32(1U << 31) relied on implementation-defined unsigned->signed
  // conversion of a value outside int32_t's range.
  const int32x4_t signMask = vreinterpretq_s32_u32(vdupq_n_u32(0x80000000U));
  const int32x4_t half = vreinterpretq_s32_f32(vdupq_n_f32(0.5f));
  float32x4_t v_addition = vreinterpretq_f32_s32(vorrq_s32(half, vandq_s32(signMask, vreinterpretq_s32_f32(v))));
  return vaddq_f32(v, v_addition);
}
#endif
// Element-wise division, generic case: a zero divisor yields 0 instead of UB.
// Loop index is int64_t to match total_size (the old size_t counter mixed
// signedness with the signed bound).
template <typename T>
inline void DivideImpl(const T *src0, const T *src1, T *dst, int64_t total_size) {
  for (int64_t i = 0; i < total_size; i++) {
    dst[i] = src1[i] ? src0[i] / src1[i] : 0;
  }
}

// uint8 specialization: round-to-nearest quotient clamped to [0, 255];
// division by zero yields 0 (the NEON mask zeroes those lanes).
template <>
inline void DivideImpl(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, int64_t total_size) {
  int64_t x = 0;
#ifdef USE_NEON
  const int64_t step = 16;
  for (; x <= total_size - step; x += step) {
    __builtin_prefetch(reinterpret_cast<const char *>(src0 + x) + 32 * 10);
    __builtin_prefetch(reinterpret_cast<const char *>(src1 + x) + 32 * 10);
    uint8x16_t v_a = vld1q_u8(src0 + x);
    uint8x16_t v_b = vld1q_u8(src1 + x);
    // All-ones where the divisor is non-zero; used to force 0 on /0 lanes.
    uint8x16_t v_mask = vtstq_u8(v_b, v_b);
    // Widen u8 -> u16 -> f32 in quarters, divide via reciprocal, round.
    uint16x8_t va_l_16x8 = vmovl_u8(vget_low_u8(v_a));
    uint16x8_t va_h_16x8 = vmovl_u8(vget_high_u8(v_a));
    uint16x8_t vb_l_16x8 = vmovl_u8(vget_low_u8(v_b));
    uint16x8_t vb_h_16x8 = vmovl_u8(vget_high_u8(v_b));
    float32x4_t va_ll_f32x4 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(va_l_16x8)));
    float32x4_t va_lh_f32x4 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(va_l_16x8)));
    float32x4_t va_hl_f32x4 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(va_h_16x8)));
    float32x4_t va_hh_f32x4 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(va_h_16x8)));
    float32x4_t vb_ll_f32x4 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vb_l_16x8)));
    float32x4_t vb_lh_f32x4 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vb_l_16x8)));
    float32x4_t vb_hl_f32x4 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vb_h_16x8)));
    float32x4_t vb_hh_f32x4 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vb_h_16x8)));
    float32x4_t vb_ll_re_f32x4 = reciprocal_simd(vb_ll_f32x4);
    float32x4_t vb_lh_re_f32x4 = reciprocal_simd(vb_lh_f32x4);
    float32x4_t vb_hl_re_f32x4 = reciprocal_simd(vb_hl_f32x4);
    float32x4_t vb_hh_re_f32x4 = reciprocal_simd(vb_hh_f32x4);
    float32x4_t dst_ll_f32x4 = round_simd(vmulq_f32(va_ll_f32x4, vb_ll_re_f32x4));
    float32x4_t dst_lh_f32x4 = round_simd(vmulq_f32(va_lh_f32x4, vb_lh_re_f32x4));
    float32x4_t dst_hl_f32x4 = round_simd(vmulq_f32(va_hl_f32x4, vb_hl_re_f32x4));
    float32x4_t dst_hh_f32x4 = round_simd(vmulq_f32(va_hh_f32x4, vb_hh_re_f32x4));
    uint32x4_t dst_ll_32x4 = vcvtq_u32_f32(dst_ll_f32x4);
    uint32x4_t dst_lh_32x4 = vcvtq_u32_f32(dst_lh_f32x4);
    uint32x4_t dst_hl_32x4 = vcvtq_u32_f32(dst_hl_f32x4);
    uint32x4_t dst_hh_32x4 = vcvtq_u32_f32(dst_hh_f32x4);
    uint16x4_t dst_ll_16x4 = vqmovn_u32(dst_ll_32x4);
    uint16x4_t dst_lh_16x4 = vqmovn_u32(dst_lh_32x4);
    uint16x4_t dst_hl_16x4 = vqmovn_u32(dst_hl_32x4);
    uint16x4_t dst_hh_16x4 = vqmovn_u32(dst_hh_32x4);
    uint16x8_t dst_l_16x8 = vcombine_u16(dst_ll_16x4, dst_lh_16x4);
    uint16x8_t dst_h_16x8 = vcombine_u16(dst_hl_16x4, dst_hh_16x4);
    // vqmovn_u16/vcombine_u8 produce *unsigned* vectors; the previous
    // int8x8_t/int8x16_t declarations were a type error under strict NEON
    // typing (rejected by clang).
    uint8x8_t dst_l_8x8 = vqmovn_u16(dst_l_16x8);
    uint8x8_t dst_h_8x8 = vqmovn_u16(dst_h_16x8);
    uint8x16_t dst_8x16 = vcombine_u8(dst_l_8x8, dst_h_8x8);
    dst_8x16 = vandq_u8(dst_8x16, v_mask);
    vst1q_u8(dst + x, dst_8x16);
  }
#endif
  for (; x < total_size; x++) {
    // Divide in float *then* round, matching the SIMD path; rounding an
    // already-truncated integer quotient (the old code) was a no-op and made
    // e.g. 7/2 differ between the scalar tail (3) and NEON lanes (4).
    int32_t val = src1[x] ? static_cast<int32_t>(std::round(static_cast<float>(src0[x]) / src1[x])) : 0;
    dst[x] = std::max<int32_t>(std::numeric_limits<uint8_t>::min(),
                               std::min<int32_t>(std::numeric_limits<uint8_t>::max(), val));
  }
}

// uint16 specialization: round-to-nearest quotient clamped to [0, 65535];
// division by zero yields 0.
template <>
inline void DivideImpl(const uint16_t *src0, const uint16_t *src1, uint16_t *dst, int64_t total_size) {
  for (int64_t i = 0; i < total_size; i++) {
    int32_t val = src1[i] ? static_cast<int32_t>(std::round(static_cast<float>(src0[i]) / src1[i])) : 0;
    dst[i] = std::max<int32_t>(std::numeric_limits<uint16_t>::min(),
                               std::min<int32_t>(std::numeric_limits<uint16_t>::max(), val));
  }
}

// uint32 specialization: divide in double (exact for 32-bit operands),
// round to nearest, clamp to [0, 4294967295]; division by zero yields 0.
template <>
inline void DivideImpl(const uint32_t *src0, const uint32_t *src1, uint32_t *dst, int64_t total_size) {
  for (int64_t i = 0; i < total_size; i++) {
    int64_t val = src1[i] ? static_cast<int64_t>(std::round(static_cast<double>(src0[i]) / src1[i])) : 0;
    dst[i] = std::max<int64_t>(std::numeric_limits<uint32_t>::min(),
                               std::min<int64_t>(std::numeric_limits<uint32_t>::max(), val));
  }
}
| bool Divide(const LiteMat &src_a, const LiteMat &src_b, LiteMat *dst) { | |||
| if (src_a.width_ != src_b.width_ || src_a.height_ != src_b.height_ || src_a.channel_ != src_b.channel_) { | |||
| return false; | |||
| } | |||
| if (src_a.data_type_ != src_b.data_type_) { | |||
| return false; | |||
| } | |||
| if (dst->IsEmpty()) { | |||
| dst->Init(src_a.width_, src_a.height_, src_a.channel_, src_a.data_type_); | |||
| } else if (src_a.width_ != dst->width_ || src_a.height_ != dst->height_ || src_a.channel_ != dst->channel_) { | |||
| return false; | |||
| } else if (src_a.data_type_ != dst->data_type_) { | |||
| return false; | |||
| } | |||
| int64_t total_size = src_a.height_ * src_a.width_ * src_a.channel_; | |||
| if (src_a.data_type_ == LDataType::BOOL) { | |||
| DivideImpl<bool>(src_a, src_b, *dst, total_size); | |||
| } else if (src_a.data_type_ == LDataType::INT8) { | |||
| DivideImpl<int8_t>(src_a, src_b, *dst, total_size); | |||
| } else if (src_a.data_type_ == LDataType::UINT8) { | |||
| DivideImpl<uint8_t>(src_a, src_b, *dst, total_size); | |||
| } else if (src_a.data_type_ == LDataType::INT16) { | |||
| DivideImpl<int16_t>(src_a, src_b, *dst, total_size); | |||
| } else if (src_a.data_type_ == LDataType::UINT16) { | |||
| DivideImpl<uint16_t>(src_a, src_b, *dst, total_size); | |||
| } else if (src_a.data_type_ == LDataType::INT32) { | |||
| DivideImpl<int32_t>(src_a, src_b, *dst, total_size); | |||
| } else if (src_a.data_type_ == LDataType::UINT32) { | |||
| DivideImpl<uint32_t>(src_a, src_b, *dst, total_size); | |||
| } else if (src_a.data_type_ == LDataType::INT64) { | |||
| DivideImpl<int64_t>(src_a, src_b, *dst, total_size); | |||
| } else if (src_a.data_type_ == LDataType::UINT64) { | |||
| DivideImpl<uint64_t>(src_a, src_b, *dst, total_size); | |||
| } else if (src_a.data_type_ == LDataType::FLOAT32) { | |||
| DivideImpl<float>(src_a, src_b, *dst, total_size); | |||
| } else if (src_a.data_type_ == LDataType::FLOAT64) { | |||
| DivideImpl<double>(src_a, src_b, *dst, total_size); | |||
| } else { | |||
| return false; | |||
| } | |||
| return true; | |||
| } | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| @@ -247,6 +247,13 @@ class LiteMat { | |||
| LDataType data_type_; | |||
| int *ref_count_; | |||
| }; | |||
| /// \brief Calculates the difference between the two images for each element | |||
| bool Subtract(const LiteMat &src_a, const LiteMat &src_b, LiteMat *dst); | |||
| /// \brief Calculates the division between the two images for each element | |||
| bool Divide(const LiteMat &src_a, const LiteMat &src_b, LiteMat *dst); | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| #endif // MINI_MAT_H_ | |||
| @@ -538,7 +538,7 @@ TEST_F(MindDataImageProcess, TestSubtractUint8) { | |||
| static_cast<UINT8_C1 *>(expect_uint8.data_ptr_)[i] = 1; | |||
| } | |||
| LiteMat dst_uint8; | |||
| EXPECT_TRUE(Subtract(src1_uint8, src2_uint8, dst_uint8)); | |||
| EXPECT_TRUE(Subtract(src1_uint8, src2_uint8, &dst_uint8)); | |||
| for (size_t i = 0; i < cols; i++) { | |||
| EXPECT_EQ(static_cast<UINT8_C1 *>(expect_uint8.data_ptr_)[i].c1, | |||
| static_cast<UINT8_C1 *>(dst_uint8.data_ptr_)[i].c1); | |||
| @@ -557,7 +557,7 @@ TEST_F(MindDataImageProcess, TestSubtractInt8) { | |||
| static_cast<INT8_C1 *>(expect_int8.data_ptr_)[i] = -1; | |||
| } | |||
| LiteMat dst_int8; | |||
| EXPECT_TRUE(Subtract(src1_int8, src2_int8, dst_int8)); | |||
| EXPECT_TRUE(Subtract(src1_int8, src2_int8, &dst_int8)); | |||
| for (size_t i = 0; i < cols; i++) { | |||
| EXPECT_EQ(static_cast<INT8_C1 *>(expect_int8.data_ptr_)[i].c1, static_cast<INT8_C1 *>(dst_int8.data_ptr_)[i].c1); | |||
| } | |||
| @@ -575,7 +575,7 @@ TEST_F(MindDataImageProcess, TestSubtractUInt16) { | |||
| static_cast<UINT16_C1 *>(expect_uint16.data_ptr_)[i] = 0; | |||
| } | |||
| LiteMat dst_uint16; | |||
| EXPECT_TRUE(Subtract(src1_uint16, src2_uint16, dst_uint16)); | |||
| EXPECT_TRUE(Subtract(src1_uint16, src2_uint16, &dst_uint16)); | |||
| for (size_t i = 0; i < cols; i++) { | |||
| EXPECT_EQ(static_cast<UINT16_C1 *>(expect_uint16.data_ptr_)[i].c1, | |||
| static_cast<UINT16_C1 *>(dst_uint16.data_ptr_)[i].c1); | |||
| @@ -594,7 +594,7 @@ TEST_F(MindDataImageProcess, TestSubtractInt16) { | |||
| static_cast<INT16_C1 *>(expect_int16.data_ptr_)[i] = -1; | |||
| } | |||
| LiteMat dst_int16; | |||
| EXPECT_TRUE(Subtract(src1_int16, src2_int16, dst_int16)); | |||
| EXPECT_TRUE(Subtract(src1_int16, src2_int16, &dst_int16)); | |||
| for (size_t i = 0; i < cols; i++) { | |||
| EXPECT_EQ(static_cast<INT16_C1 *>(expect_int16.data_ptr_)[i].c1, | |||
| static_cast<INT16_C1 *>(dst_int16.data_ptr_)[i].c1); | |||
| @@ -613,7 +613,7 @@ TEST_F(MindDataImageProcess, TestSubtractUInt32) { | |||
| static_cast<UINT32_C1 *>(expect_uint32.data_ptr_)[i] = 0; | |||
| } | |||
| LiteMat dst_uint32; | |||
| EXPECT_TRUE(Subtract(src1_uint32, src2_uint32, dst_uint32)); | |||
| EXPECT_TRUE(Subtract(src1_uint32, src2_uint32, &dst_uint32)); | |||
| for (size_t i = 0; i < cols; i++) { | |||
| EXPECT_EQ(static_cast<UINT32_C1 *>(expect_uint32.data_ptr_)[i].c1, | |||
| static_cast<UINT32_C1 *>(dst_uint32.data_ptr_)[i].c1); | |||
| @@ -632,7 +632,7 @@ TEST_F(MindDataImageProcess, TestSubtractInt32) { | |||
| static_cast<INT32_C1 *>(expect_int32.data_ptr_)[i] = -2; | |||
| } | |||
| LiteMat dst_int32; | |||
| EXPECT_TRUE(Subtract(src1_int32, src2_int32, dst_int32)); | |||
| EXPECT_TRUE(Subtract(src1_int32, src2_int32, &dst_int32)); | |||
| for (size_t i = 0; i < cols; i++) { | |||
| EXPECT_EQ(static_cast<INT32_C1 *>(expect_int32.data_ptr_)[i].c1, | |||
| static_cast<INT32_C1 *>(dst_int32.data_ptr_)[i].c1); | |||
| @@ -651,7 +651,7 @@ TEST_F(MindDataImageProcess, TestSubtractFloat) { | |||
| static_cast<FLOAT32_C1 *>(expect_float.data_ptr_)[i] = -2.3; | |||
| } | |||
| LiteMat dst_float; | |||
| EXPECT_TRUE(Subtract(src1_float, src2_float, dst_float)); | |||
| EXPECT_TRUE(Subtract(src1_float, src2_float, &dst_float)); | |||
| for (size_t i = 0; i < cols; i++) { | |||
| EXPECT_FLOAT_EQ(static_cast<FLOAT32_C1 *>(expect_float.data_ptr_)[i].c1, | |||
| static_cast<FLOAT32_C1 *>(dst_float.data_ptr_)[i].c1); | |||
| @@ -670,7 +670,7 @@ TEST_F(MindDataImageProcess, TestDivideUint8) { | |||
| static_cast<UINT8_C1 *>(expect_uint8.data_ptr_)[i] = 2; | |||
| } | |||
| LiteMat dst_uint8; | |||
| EXPECT_TRUE(Divide(src1_uint8, src2_uint8, dst_uint8)); | |||
| EXPECT_TRUE(Divide(src1_uint8, src2_uint8, &dst_uint8)); | |||
| for (size_t i = 0; i < cols; i++) { | |||
| EXPECT_EQ(static_cast<UINT8_C1 *>(expect_uint8.data_ptr_)[i].c1, | |||
| static_cast<UINT8_C1 *>(dst_uint8.data_ptr_)[i].c1); | |||
| @@ -689,7 +689,7 @@ TEST_F(MindDataImageProcess, TestDivideInt8) { | |||
| static_cast<INT8_C1 *>(expect_int8.data_ptr_)[i] = -2; | |||
| } | |||
| LiteMat dst_int8; | |||
| EXPECT_TRUE(Divide(src1_int8, src2_int8, dst_int8)); | |||
| EXPECT_TRUE(Divide(src1_int8, src2_int8, &dst_int8)); | |||
| for (size_t i = 0; i < cols; i++) { | |||
| EXPECT_EQ(static_cast<INT8_C1 *>(expect_int8.data_ptr_)[i].c1, static_cast<INT8_C1 *>(dst_int8.data_ptr_)[i].c1); | |||
| } | |||
| @@ -707,7 +707,7 @@ TEST_F(MindDataImageProcess, TestDivideUInt16) { | |||
| static_cast<UINT16_C1 *>(expect_uint16.data_ptr_)[i] = 2; | |||
| } | |||
| LiteMat dst_uint16; | |||
| EXPECT_TRUE(Divide(src1_uint16, src2_uint16, dst_uint16)); | |||
| EXPECT_TRUE(Divide(src1_uint16, src2_uint16, &dst_uint16)); | |||
| for (size_t i = 0; i < cols; i++) { | |||
| EXPECT_EQ(static_cast<UINT16_C1 *>(expect_uint16.data_ptr_)[i].c1, | |||
| static_cast<UINT16_C1 *>(dst_uint16.data_ptr_)[i].c1); | |||
| @@ -726,7 +726,7 @@ TEST_F(MindDataImageProcess, TestDivideInt16) { | |||
| static_cast<INT16_C1 *>(expect_int16.data_ptr_)[i] = -10000; | |||
| } | |||
| LiteMat dst_int16; | |||
| EXPECT_TRUE(Divide(src1_int16, src2_int16, dst_int16)); | |||
| EXPECT_TRUE(Divide(src1_int16, src2_int16, &dst_int16)); | |||
| for (size_t i = 0; i < cols; i++) { | |||
| EXPECT_EQ(static_cast<INT16_C1 *>(expect_int16.data_ptr_)[i].c1, | |||
| static_cast<INT16_C1 *>(dst_int16.data_ptr_)[i].c1); | |||
| @@ -745,7 +745,7 @@ TEST_F(MindDataImageProcess, TestDivideUInt32) { | |||
| static_cast<UINT32_C1 *>(expect_uint32.data_ptr_)[i] = 1000000000; | |||
| } | |||
| LiteMat dst_uint32; | |||
| EXPECT_TRUE(Divide(src1_uint32, src2_uint32, dst_uint32)); | |||
| EXPECT_TRUE(Divide(src1_uint32, src2_uint32, &dst_uint32)); | |||
| for (size_t i = 0; i < cols; i++) { | |||
| EXPECT_EQ(static_cast<UINT32_C1 *>(expect_uint32.data_ptr_)[i].c1, | |||
| static_cast<UINT32_C1 *>(dst_uint32.data_ptr_)[i].c1); | |||
| @@ -764,7 +764,7 @@ TEST_F(MindDataImageProcess, TestDivideInt32) { | |||
| static_cast<INT32_C1 *>(expect_int32.data_ptr_)[i] = -1000000000; | |||
| } | |||
| LiteMat dst_int32; | |||
| EXPECT_TRUE(Divide(src1_int32, src2_int32, dst_int32)); | |||
| EXPECT_TRUE(Divide(src1_int32, src2_int32, &dst_int32)); | |||
| for (size_t i = 0; i < cols; i++) { | |||
| EXPECT_EQ(static_cast<INT32_C1 *>(expect_int32.data_ptr_)[i].c1, | |||
| static_cast<INT32_C1 *>(dst_int32.data_ptr_)[i].c1); | |||
| @@ -783,7 +783,7 @@ TEST_F(MindDataImageProcess, TestDivideFloat) { | |||
| static_cast<FLOAT32_C1 *>(expect_float.data_ptr_)[i] = -6.17f; | |||
| } | |||
| LiteMat dst_float; | |||
| EXPECT_TRUE(Divide(src1_float, src2_float, dst_float)); | |||
| EXPECT_TRUE(Divide(src1_float, src2_float, &dst_float)); | |||
| for (size_t i = 0; i < cols; i++) { | |||
| EXPECT_FLOAT_EQ(static_cast<FLOAT32_C1 *>(expect_float.data_ptr_)[i].c1, | |||
| static_cast<FLOAT32_C1 *>(dst_float.data_ptr_)[i].c1); | |||