Browse Source

[MSLITE] code check

tags/v1.6.0
ling 4 years ago
parent
commit
2f5c299438
13 changed files with 221 additions and 228 deletions
  1. +143
    -203
      mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/instance_norm_fp32.c
  2. +12
    -0
      mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/instance_norm_fp32.h
  3. +4
    -9
      mindspore/core/mindrt/include/async/collect.h
  4. +2
    -2
      mindspore/core/mindrt/src/async/uuid_base.cc
  5. +33
    -0
      mindspore/core/ops/splice.h
  6. +2
    -2
      mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_fp32.cc
  7. +0
    -1
      mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_fp32.h
  8. +7
    -2
      mindspore/lite/src/runtime/kernel/arm/fp32/group_convolution_fp32.cc
  9. +1
    -1
      mindspore/lite/src/runtime/kernel/arm/fp32/lstm_fp32.cc
  10. +1
    -1
      mindspore/lite/src/runtime/kernel/arm/fp32/lstm_fp32.h
  11. +6
    -6
      mindspore/lite/src/runtime/kernel/arm/fp32/non_max_suppression_fp32.cc
  12. +9
    -0
      mindspore/lite/src/runtime/kernel/arm/fp32/non_max_suppression_fp32.h
  13. +1
    -1
      mindspore/lite/src/runtime/kernel/arm/fp32/pad_fp32.cc

+ 143
- 203
mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/instance_norm_fp32.c View File

@@ -35,7 +35,7 @@ int InstanceNorm(const float *src_data, float *dst_data, const float *gamma_data
const float *src = src_b + c * param->inner_size_;
float *dst = dst_b + c * param->inner_size_;
double mean = 0.0f;
double square_mean = 0.0f;
double squ_m = 0.0f;

int index = 0;
#if defined(ENABLE_AVX)
@@ -46,7 +46,7 @@ int InstanceNorm(const float *src_data, float *dst_data, const float *gamma_data
__m128 square128 = _mm_add_ps(_mm256_extractf128_ps(squarev, 0), _mm256_extractf128_ps(squarev, 1));
for (int i = 0; i < C4NUM; ++i) {
mean += MS_F32X4_GETI(src128, i);
square_mean += MS_F32X4_GETI(square128, i);
squ_m += MS_F32X4_GETI(square128, i);
}
}
#endif
@@ -57,11 +57,11 @@ int InstanceNorm(const float *src_data, float *dst_data, const float *gamma_data
MS_FLOAT32X4 squarev = MS_MULQ_F32(srcv, srcv);
#ifdef ENABLE_ARM64
mean += vaddvq_f32(srcv);
square_mean += vaddvq_f32(squarev);
squ_m += vaddvq_f32(squarev);
#elif defined(ENABLE_SSE)
for (int i = 0; i < C4NUM; ++i) {
mean += MS_F32X4_GETI(srcv, i);
square_mean += MS_F32X4_GETI(squarev, i);
squ_m += MS_F32X4_GETI(squarev, i);
}
#else
float32x2_t src_add2 = vadd_f32(vget_low_f32(srcv), vget_high_f32(srcv));
@@ -69,18 +69,18 @@ int InstanceNorm(const float *src_data, float *dst_data, const float *gamma_data
mean += vget_lane_f32(src_add4, 0);
float32x2_t square_add2 = vadd_f32(vget_low_f32(squarev), vget_high_f32(squarev));
float32x2_t square_add4 = vpadd_f32(square_add2, square_add2);
square_mean += vget_lane_f32(square_add4, 0);
squ_m += vget_lane_f32(square_add4, 0);
#endif
}
#endif
for (; index < param->inner_size_; index++) {
mean += src[index];
square_mean += src[index] * src[index];
squ_m += src[index] * src[index];
}

mean /= (float)param->inner_size_;
square_mean /= (float)param->inner_size_;
const double deno = gamma_data[c] / sqrt(square_mean - mean * mean + param->epsilon_);
squ_m /= (float)param->inner_size_;
const double deno = gamma_data[c] / sqrt(squ_m - mean * mean + param->epsilon_);

index = 0;
#if defined(ENABLE_AVX)
@@ -112,6 +112,112 @@ int InstanceNorm(const float *src_data, float *dst_data, const float *gamma_data
return NNACL_OK;
}

#if defined(ENABLE_SSE) || defined(ENABLE_ARM)
// Vectorized InstanceNorm inner loop for NC4HW4-packed float data (SSE / ARM NEON path).
// Processes channels [*c_src, channel_end) in SIMD blocks of 16, then 8, then 4 channels
// (C16NUM / C8NUM / C4NUM), updating *c_src so the scalar caller can finish any remainder.
// For each channel group it makes two passes over the hw_plane spatial elements:
//   pass 1: accumulate sum and sum-of-squares, then divide by hw_planev to get
//           mean and mean-of-squares (E[x], E[x^2]);
//   pass 2: write dst = (src - mean) * (gamma / sqrt(E[x^2] - E[x]^2 + epsilon)) + beta.
// NOTE(review): src_b is read as C4-packed planes (src index stride C4NUM per spatial step)
// while dst_b is written with stride `channel` per spatial step — assumes the caller's
// NC4HW4 -> NCHW-style output layout; confirm against InstanceNormNC4HW4.
void InstanceNormC4HW4ArmSse(const float *src_b, float *dst_b, const float *gamma_data, const float *beta_data,
int *c_src, const InstanceNormParameter *param, int channel, int channel_end, int hw_plane,
MS_FLOAT32X4 hw_planev) {
int c = *c_src;
// 16-channel blocks: four independent 4-lane accumulators per statistic.
for (; c <= channel_end - C16NUM; c += C16NUM) {
const float *src = src_b + c * hw_plane, *src1 = src_b + (c + C4NUM) * hw_plane;
const float *src2 = src_b + (c + C8NUM) * hw_plane, *src3 = src_b + (c + C12NUM) * hw_plane;
float *dst = dst_b + c;
MS_FLOAT32X4 mean = MS_MOVQ_F32(0.0f), mean1 = MS_MOVQ_F32(0.0f);
MS_FLOAT32X4 mean2 = MS_MOVQ_F32(0.0f), mean3 = MS_MOVQ_F32(0.0f);
MS_FLOAT32X4 squ_m = MS_MOVQ_F32(0.0f), squ_m1 = MS_MOVQ_F32(0.0f);
MS_FLOAT32X4 squ_m2 = MS_MOVQ_F32(0.0f), squ_m3 = MS_MOVQ_F32(0.0f);
for (int index = 0; index < hw_plane; ++index) {
MS_FLOAT32X4 srcv = MS_LDQ_F32(src + index * C4NUM), srcv1 = MS_LDQ_F32(src1 + index * C4NUM);
MS_FLOAT32X4 srcv2 = MS_LDQ_F32(src2 + index * C4NUM), srcv3 = MS_LDQ_F32(src3 + index * C4NUM);
MS_FLOAT32X4 squarev = MS_MULQ_F32(srcv, srcv), squarev1 = MS_MULQ_F32(srcv1, srcv1);
MS_FLOAT32X4 squarev2 = MS_MULQ_F32(srcv2, srcv2), squarev3 = MS_MULQ_F32(srcv3, srcv3);
MS_ADDQ_F32_VEC(mean, mean1, mean2, mean3, srcv, srcv1, srcv2, srcv3);
MS_ADDQ_F32_VEC(squ_m, squ_m1, squ_m2, squ_m3, squarev, squarev1, squarev2, squarev3);
}
// Turn the accumulated sums into means: divide all eight accumulators by hw_plane.
MS_DIVQ_F32_VEC(mean, mean1, mean2, mean3, hw_planev);
MS_DIVQ_F32_VEC(squ_m, squ_m1, squ_m2, squ_m3, hw_planev);

// Variance via E[x^2] - E[x]^2, stabilized with epsilon.
MS_FLOAT32X4 deno = MS_ADDQ_F32(MS_SUBQ_F32(squ_m, MS_MULQ_F32(mean, mean)), MS_MOVQ_F32(param->epsilon_));
MS_FLOAT32X4 deno1 = MS_ADDQ_F32(MS_SUBQ_F32(squ_m1, MS_MULQ_F32(mean1, mean1)), MS_MOVQ_F32(param->epsilon_));
MS_FLOAT32X4 deno2 = MS_ADDQ_F32(MS_SUBQ_F32(squ_m2, MS_MULQ_F32(mean2, mean2)), MS_MOVQ_F32(param->epsilon_));
MS_FLOAT32X4 deno3 = MS_ADDQ_F32(MS_SUBQ_F32(squ_m3, MS_MULQ_F32(mean3, mean3)), MS_MOVQ_F32(param->epsilon_));

// deno <- 1 / sqrt(variance + epsilon)
deno = MS_DIVQ_F32(MS_MOVQ_F32(1.0f), MS_SQRTFX4_F32(deno));
deno1 = MS_DIVQ_F32(MS_MOVQ_F32(1.0f), MS_SQRTFX4_F32(deno1));
deno2 = MS_DIVQ_F32(MS_MOVQ_F32(1.0f), MS_SQRTFX4_F32(deno2));
deno3 = MS_DIVQ_F32(MS_MOVQ_F32(1.0f), MS_SQRTFX4_F32(deno3));

// Fold gamma into the reciprocal std-dev so pass 2 is a single multiply-add per element.
MS_FLOAT32X4 gammav = MS_MULQ_F32(MS_LDQ_F32(gamma_data + c), deno); // deno * gamma_data[c]
MS_FLOAT32X4 gammav1 = MS_MULQ_F32(MS_LDQ_F32(gamma_data + c + C4NUM), deno1); // deno * gamma_data[c]
MS_FLOAT32X4 gammav2 = MS_MULQ_F32(MS_LDQ_F32(gamma_data + c + C8NUM), deno2); // deno * gamma_data[c]
MS_FLOAT32X4 gammav3 = MS_MULQ_F32(MS_LDQ_F32(gamma_data + c + C12NUM), deno3); // deno * gamma_data[c]
MS_FLOAT32X4 betav = MS_LDQ_F32(beta_data + c), betav1 = MS_LDQ_F32(beta_data + c + C4NUM);
MS_FLOAT32X4 betav2 = MS_LDQ_F32(beta_data + c + C8NUM), betav3 = MS_LDQ_F32(beta_data + c + C12NUM);
for (int index = 0; index < hw_plane; ++index) {
MS_FLOAT32X4 srcv = MS_LDQ_F32(src + index * C4NUM), srcv1 = MS_LDQ_F32(src1 + index * C4NUM);
MS_FLOAT32X4 srcv2 = MS_LDQ_F32(src2 + index * C4NUM), srcv3 = MS_LDQ_F32(src3 + index * C4NUM);
MS_FLOAT32X4 outv = MS_SUBQ_F32(srcv, mean), outv1 = MS_SUBQ_F32(srcv1, mean1);
MS_FLOAT32X4 outv2 = MS_SUBQ_F32(srcv2, mean2), outv3 = MS_SUBQ_F32(srcv3, mean3);

outv = MS_MULQ_F32(outv, gammav), outv1 = MS_MULQ_F32(outv1, gammav1);
outv2 = MS_MULQ_F32(outv2, gammav2), outv3 = MS_MULQ_F32(outv3, gammav3);
MS_ADDQ_F32_VEC(outv, outv1, outv2, outv3, betav, betav1, betav2, betav3);

MS_STQ_F32(dst + index * channel, outv), MS_STQ_F32(dst + index * channel + C4NUM, outv1);
MS_STQ_F32(dst + index * channel + C8NUM, outv2), MS_STQ_F32(dst + index * channel + C12NUM, outv3);
}
}
// 8-channel blocks: same two-pass scheme with two accumulators per statistic.
for (; c <= channel_end - C8NUM; c += C8NUM) {
const float *src = src_b + c * hw_plane, *src1 = src_b + (c + C4NUM) * hw_plane;
float *dst = dst_b + c;
MS_FLOAT32X4 mean = MS_MOVQ_F32(0.0f), mean1 = MS_MOVQ_F32(0.0f);
MS_FLOAT32X4 squ_m = MS_MOVQ_F32(0.0f), squ_m1 = MS_MOVQ_F32(0.0f);
for (int index = 0; index < hw_plane; ++index) {
MS_FLOAT32X4 srcv = MS_LDQ_F32(src + index * C4NUM), srcv1 = MS_LDQ_F32(src1 + index * C4NUM);
MS_FLOAT32X4 squarev = MS_MULQ_F32(srcv, srcv), squarev1 = MS_MULQ_F32(srcv1, srcv1);
mean = MS_ADDQ_F32(mean, srcv), mean1 = MS_ADDQ_F32(mean1, srcv1);
squ_m = MS_ADDQ_F32(squ_m, squarev), squ_m1 = MS_ADDQ_F32(squ_m1, squarev1);
}

// The 4-slot divide macro is reused here to divide all four accumulators at once.
MS_DIVQ_F32_VEC(mean, mean1, squ_m, squ_m1, hw_planev);
MS_FLOAT32X4 deno = MS_ADDQ_F32(MS_SUBQ_F32(squ_m, MS_MULQ_F32(mean, mean)), MS_MOVQ_F32(param->epsilon_));
MS_FLOAT32X4 deno1 = MS_ADDQ_F32(MS_SUBQ_F32(squ_m1, MS_MULQ_F32(mean1, mean1)), MS_MOVQ_F32(param->epsilon_));
deno = MS_DIVQ_F32(MS_MOVQ_F32(1.0f), MS_SQRTFX4_F32(deno));
deno1 = MS_DIVQ_F32(MS_MOVQ_F32(1.0f), MS_SQRTFX4_F32(deno1));

MS_FLOAT32X4 gammav = MS_MULQ_F32(MS_LDQ_F32(gamma_data + c), deno); // deno * gamma_data[c]
MS_FLOAT32X4 gammav1 = MS_MULQ_F32(MS_LDQ_F32(gamma_data + c + C4NUM), deno1); // deno * gamma_data[c]
MS_FLOAT32X4 betav = MS_LDQ_F32(beta_data + c), betav1 = MS_LDQ_F32(beta_data + c + C4NUM);
for (int index = 0; index < hw_plane; ++index) {
MS_FLOAT32X4 srcv = MS_LDQ_F32(src + index * C4NUM), srcv1 = MS_LDQ_F32(src1 + index * C4NUM);
MS_FLOAT32X4 outv = MS_SUBQ_F32(srcv, mean), outv1 = MS_SUBQ_F32(srcv1, mean1);
outv = MS_MULQ_F32(outv, gammav), outv1 = MS_MULQ_F32(outv1, gammav1);
outv = MS_ADDQ_F32(outv, betav), outv1 = MS_ADDQ_F32(outv1, betav1);
MS_STQ_F32(dst + index * channel, outv);
MS_STQ_F32(dst + index * channel + C4NUM, outv1);
}
}
// 4-channel blocks: single-accumulator tail of the SIMD path.
for (; c <= channel_end - C4NUM; c += C4NUM) {
const float *src = src_b + c * hw_plane;
float *dst = dst_b + c;
MS_FLOAT32X4 mean = MS_MOVQ_F32(0.0f), squ_m = MS_MOVQ_F32(0.0f);
for (int index = 0; index < hw_plane; ++index) {
MS_FLOAT32X4 srcv = MS_LDQ_F32(src + index * C4NUM), squarev = MS_MULQ_F32(srcv, srcv);
mean = MS_ADDQ_F32(mean, srcv), squ_m = MS_ADDQ_F32(squ_m, squarev);
}
mean = MS_DIVQ_F32(mean, hw_planev), squ_m = MS_DIVQ_F32(squ_m, hw_planev);
MS_FLOAT32X4 deno = MS_ADDQ_F32(MS_SUBQ_F32(squ_m, MS_MULQ_F32(mean, mean)), MS_MOVQ_F32(param->epsilon_));
deno = MS_DIVQ_F32(MS_MOVQ_F32(1.0f), MS_SQRTFX4_F32(deno));

MS_FLOAT32X4 gammav = MS_MULQ_F32(MS_LDQ_F32(gamma_data + c), deno), betav = MS_LDQ_F32(beta_data + c);
for (int index = 0; index < hw_plane; ++index) {
MS_FLOAT32X4 srcv = MS_LDQ_F32(src + index * C4NUM), outv = MS_SUBQ_F32(srcv, mean);
MS_STQ_F32(dst + index * channel, MS_ADDQ_F32(MS_MULQ_F32(outv, gammav), betav));
}
}
// Report back how far the SIMD path got; remaining channels (< C4NUM) are handled by the caller.
*c_src = c;
}
#endif

int InstanceNormNC4HW4(const float *src_data, float *dst_data, const float *gamma_data, const float *beta_data,
const InstanceNormParameter *param, size_t task_id) {
NNACL_CHECK_NULL_RETURN_ERR(src_data);
@@ -130,161 +236,7 @@ int InstanceNormNC4HW4(const float *src_data, float *dst_data, const float *gamm
float *dst_b = dst_data + b * channel * hw_plane;
int c = channel_begin;
#if defined(ENABLE_ARM) || defined(ENABLE_SSE)
for (; c <= channel_end - C16NUM; c += C16NUM) {
const float *src = src_b + c * hw_plane;
const float *src1 = src_b + (c + C4NUM) * hw_plane;
const float *src2 = src_b + (c + C8NUM) * hw_plane;
const float *src3 = src_b + (c + C12NUM) * hw_plane;
float *dst = dst_b + c;
MS_FLOAT32X4 mean = MS_MOVQ_F32(0.0f);
MS_FLOAT32X4 mean1 = MS_MOVQ_F32(0.0f);
MS_FLOAT32X4 mean2 = MS_MOVQ_F32(0.0f);
MS_FLOAT32X4 mean3 = MS_MOVQ_F32(0.0f);
MS_FLOAT32X4 square_mean = MS_MOVQ_F32(0.0f);
MS_FLOAT32X4 square_mean1 = MS_MOVQ_F32(0.0f);
MS_FLOAT32X4 square_mean2 = MS_MOVQ_F32(0.0f);
MS_FLOAT32X4 square_mean3 = MS_MOVQ_F32(0.0f);
for (int index = 0; index < hw_plane; ++index) {
MS_FLOAT32X4 srcv = MS_LDQ_F32(src + index * C4NUM);
MS_FLOAT32X4 srcv1 = MS_LDQ_F32(src1 + index * C4NUM);
MS_FLOAT32X4 srcv2 = MS_LDQ_F32(src2 + index * C4NUM);
MS_FLOAT32X4 srcv3 = MS_LDQ_F32(src3 + index * C4NUM);
MS_FLOAT32X4 squarev = MS_MULQ_F32(srcv, srcv);
MS_FLOAT32X4 squarev1 = MS_MULQ_F32(srcv1, srcv1);
MS_FLOAT32X4 squarev2 = MS_MULQ_F32(srcv2, srcv2);
MS_FLOAT32X4 squarev3 = MS_MULQ_F32(srcv3, srcv3);
mean = MS_ADDQ_F32(mean, srcv);
mean1 = MS_ADDQ_F32(mean1, srcv1);
mean2 = MS_ADDQ_F32(mean2, srcv2);
mean3 = MS_ADDQ_F32(mean3, srcv3);
square_mean = MS_ADDQ_F32(square_mean, squarev);
square_mean1 = MS_ADDQ_F32(square_mean1, squarev1);
square_mean2 = MS_ADDQ_F32(square_mean2, squarev2);
square_mean3 = MS_ADDQ_F32(square_mean3, squarev3);
}
mean = MS_DIVQ_F32(mean, hw_planev);
mean1 = MS_DIVQ_F32(mean1, hw_planev);
mean2 = MS_DIVQ_F32(mean2, hw_planev);
mean3 = MS_DIVQ_F32(mean3, hw_planev);
square_mean = MS_DIVQ_F32(square_mean, hw_planev);
square_mean1 = MS_DIVQ_F32(square_mean1, hw_planev);
square_mean2 = MS_DIVQ_F32(square_mean2, hw_planev);
square_mean3 = MS_DIVQ_F32(square_mean3, hw_planev);
MS_FLOAT32X4 deno = MS_ADDQ_F32(MS_SUBQ_F32(square_mean, MS_MULQ_F32(mean, mean)), MS_MOVQ_F32(param->epsilon_));
MS_FLOAT32X4 deno1 =
MS_ADDQ_F32(MS_SUBQ_F32(square_mean1, MS_MULQ_F32(mean1, mean1)), MS_MOVQ_F32(param->epsilon_));
MS_FLOAT32X4 deno2 =
MS_ADDQ_F32(MS_SUBQ_F32(square_mean2, MS_MULQ_F32(mean2, mean2)), MS_MOVQ_F32(param->epsilon_));
MS_FLOAT32X4 deno3 =
MS_ADDQ_F32(MS_SUBQ_F32(square_mean3, MS_MULQ_F32(mean3, mean3)), MS_MOVQ_F32(param->epsilon_));
deno = MS_DIVQ_F32(MS_MOVQ_F32(1.0f), MS_SQRTFX4_F32(deno));
deno1 = MS_DIVQ_F32(MS_MOVQ_F32(1.0f), MS_SQRTFX4_F32(deno1));
deno2 = MS_DIVQ_F32(MS_MOVQ_F32(1.0f), MS_SQRTFX4_F32(deno2));
deno3 = MS_DIVQ_F32(MS_MOVQ_F32(1.0f), MS_SQRTFX4_F32(deno3));

MS_FLOAT32X4 gammav = MS_MULQ_F32(MS_LDQ_F32(gamma_data + c), deno); // deno * gamma_data[c]
MS_FLOAT32X4 gammav1 = MS_MULQ_F32(MS_LDQ_F32(gamma_data + c + C4NUM), deno1); // deno * gamma_data[c]
MS_FLOAT32X4 gammav2 = MS_MULQ_F32(MS_LDQ_F32(gamma_data + c + C8NUM), deno2); // deno * gamma_data[c]
MS_FLOAT32X4 gammav3 = MS_MULQ_F32(MS_LDQ_F32(gamma_data + c + C12NUM), deno3); // deno * gamma_data[c]
MS_FLOAT32X4 betav = MS_LDQ_F32(beta_data + c);
MS_FLOAT32X4 betav1 = MS_LDQ_F32(beta_data + c + C4NUM);
MS_FLOAT32X4 betav2 = MS_LDQ_F32(beta_data + c + C8NUM);
MS_FLOAT32X4 betav3 = MS_LDQ_F32(beta_data + c + C12NUM);
for (int index = 0; index < hw_plane; ++index) {
MS_FLOAT32X4 srcv = MS_LDQ_F32(src + index * C4NUM);
MS_FLOAT32X4 srcv1 = MS_LDQ_F32(src1 + index * C4NUM);
MS_FLOAT32X4 srcv2 = MS_LDQ_F32(src2 + index * C4NUM);
MS_FLOAT32X4 srcv3 = MS_LDQ_F32(src3 + index * C4NUM);
MS_FLOAT32X4 outv = MS_SUBQ_F32(srcv, mean);
MS_FLOAT32X4 outv1 = MS_SUBQ_F32(srcv1, mean1);
MS_FLOAT32X4 outv2 = MS_SUBQ_F32(srcv2, mean2);
MS_FLOAT32X4 outv3 = MS_SUBQ_F32(srcv3, mean3);
outv = MS_MULQ_F32(outv, gammav);
outv1 = MS_MULQ_F32(outv1, gammav1);
outv2 = MS_MULQ_F32(outv2, gammav2);
outv3 = MS_MULQ_F32(outv3, gammav3);
outv = MS_ADDQ_F32(outv, betav);
outv1 = MS_ADDQ_F32(outv1, betav1);
outv2 = MS_ADDQ_F32(outv2, betav2);
outv3 = MS_ADDQ_F32(outv3, betav3);
MS_STQ_F32(dst + index * channel, outv);
MS_STQ_F32(dst + index * channel + C4NUM, outv1);
MS_STQ_F32(dst + index * channel + C8NUM, outv2);
MS_STQ_F32(dst + index * channel + C12NUM, outv3);
}
}
for (; c <= channel_end - C8NUM; c += C8NUM) {
const float *src = src_b + c * hw_plane;
const float *src1 = src_b + (c + C4NUM) * hw_plane;
float *dst = dst_b + c;
MS_FLOAT32X4 mean = MS_MOVQ_F32(0.0f);
MS_FLOAT32X4 mean1 = MS_MOVQ_F32(0.0f);
MS_FLOAT32X4 square_mean = MS_MOVQ_F32(0.0f);
MS_FLOAT32X4 square_mean1 = MS_MOVQ_F32(0.0f);
for (int index = 0; index < hw_plane; ++index) {
MS_FLOAT32X4 srcv = MS_LDQ_F32(src + index * C4NUM);
MS_FLOAT32X4 srcv1 = MS_LDQ_F32(src1 + index * C4NUM);
MS_FLOAT32X4 squarev = MS_MULQ_F32(srcv, srcv);
MS_FLOAT32X4 squarev1 = MS_MULQ_F32(srcv1, srcv1);
mean = MS_ADDQ_F32(mean, srcv);
mean1 = MS_ADDQ_F32(mean1, srcv1);
square_mean = MS_ADDQ_F32(square_mean, squarev);
square_mean1 = MS_ADDQ_F32(square_mean1, squarev1);
}
mean = MS_DIVQ_F32(mean, hw_planev);
mean1 = MS_DIVQ_F32(mean1, hw_planev);
square_mean = MS_DIVQ_F32(square_mean, hw_planev);
square_mean1 = MS_DIVQ_F32(square_mean1, hw_planev);
MS_FLOAT32X4 deno = MS_ADDQ_F32(MS_SUBQ_F32(square_mean, MS_MULQ_F32(mean, mean)), MS_MOVQ_F32(param->epsilon_));
MS_FLOAT32X4 deno1 =
MS_ADDQ_F32(MS_SUBQ_F32(square_mean1, MS_MULQ_F32(mean1, mean1)), MS_MOVQ_F32(param->epsilon_));
deno = MS_DIVQ_F32(MS_MOVQ_F32(1.0f), MS_SQRTFX4_F32(deno));
deno1 = MS_DIVQ_F32(MS_MOVQ_F32(1.0f), MS_SQRTFX4_F32(deno1));

MS_FLOAT32X4 gammav = MS_MULQ_F32(MS_LDQ_F32(gamma_data + c), deno); // deno * gamma_data[c]
MS_FLOAT32X4 gammav1 = MS_MULQ_F32(MS_LDQ_F32(gamma_data + c + C4NUM), deno1); // deno * gamma_data[c]
MS_FLOAT32X4 betav = MS_LDQ_F32(beta_data + c);
MS_FLOAT32X4 betav1 = MS_LDQ_F32(beta_data + c + C4NUM);
for (int index = 0; index < hw_plane; ++index) {
MS_FLOAT32X4 srcv = MS_LDQ_F32(src + index * C4NUM);
MS_FLOAT32X4 srcv1 = MS_LDQ_F32(src1 + index * C4NUM);
MS_FLOAT32X4 outv = MS_SUBQ_F32(srcv, mean);
MS_FLOAT32X4 outv1 = MS_SUBQ_F32(srcv1, mean1);
outv = MS_MULQ_F32(outv, gammav);
outv1 = MS_MULQ_F32(outv1, gammav1);
outv = MS_ADDQ_F32(outv, betav);
outv1 = MS_ADDQ_F32(outv1, betav1);
MS_STQ_F32(dst + index * channel, outv);
MS_STQ_F32(dst + index * channel + C4NUM, outv1);
}
}
for (; c <= channel_end - C4NUM; c += C4NUM) {
const float *src = src_b + c * hw_plane;
float *dst = dst_b + c;
MS_FLOAT32X4 mean = MS_MOVQ_F32(0.0f);
MS_FLOAT32X4 square_mean = MS_MOVQ_F32(0.0f);
for (int index = 0; index < hw_plane; ++index) {
MS_FLOAT32X4 srcv = MS_LDQ_F32(src + index * C4NUM);
MS_FLOAT32X4 squarev = MS_MULQ_F32(srcv, srcv);
mean = MS_ADDQ_F32(mean, srcv);
square_mean = MS_ADDQ_F32(square_mean, squarev);
}
mean = MS_DIVQ_F32(mean, hw_planev);
square_mean = MS_DIVQ_F32(square_mean, hw_planev);
MS_FLOAT32X4 deno =
MS_ADDQ_F32(MS_SUBQ_F32(square_mean, MS_MULQ_F32(mean, mean)), MS_MOVQ_F32(param->epsilon_)); // question
deno = MS_DIVQ_F32(MS_MOVQ_F32(1.0f), MS_SQRTFX4_F32(deno));

MS_FLOAT32X4 gammav = MS_MULQ_F32(MS_LDQ_F32(gamma_data + c), deno); // deno * gamma_data[c]
MS_FLOAT32X4 betav = MS_LDQ_F32(beta_data + c);
for (int index = 0; index < hw_plane; ++index) {
MS_FLOAT32X4 srcv = MS_LDQ_F32(src + index * C4NUM);
MS_FLOAT32X4 outv = MS_SUBQ_F32(srcv, mean);
outv = MS_MULQ_F32(outv, gammav);
outv = MS_ADDQ_F32(outv, betav);
MS_STQ_F32(dst + index * channel, outv);
}
}
InstanceNormC4HW4ArmSse(src_b, dst_b, gamma_data, beta_data, &c, param, channel, channel_end, hw_plane, hw_planev);
#endif
for (; c < channel_end; ++c) {
int c4_down_loop = c / C4NUM * C4NUM;
@@ -293,15 +245,15 @@ int InstanceNormNC4HW4(const float *src_data, float *dst_data, const float *gamm
const float *src = src_b + c4_down_loop * hw_plane + c4_mod;
float *dst = dst_b + c;
float mean = 0.0f;
float square_mean = 0.0f;
float squ_m = 0.0f;
for (int index = 0; index < hw_plane; ++index) {
float tmp = src[index * c_res];
mean += tmp;
square_mean += tmp * tmp;
squ_m += tmp * tmp;
}
mean /= (float)hw_plane;
square_mean /= (float)hw_plane;
const float deno = gamma_data[c] / sqrtf(square_mean - mean * mean + param->epsilon_);
squ_m /= (float)hw_plane;
const float deno = gamma_data[c] / sqrtf(squ_m - mean * mean + param->epsilon_);
for (int index = 0; index < hw_plane; ++index) {
dst[index * channel] = (src[index * c_res] - mean) * deno + beta_data[c];
}
@@ -316,8 +268,7 @@ int InstanceNormNC8HW8(const float *src_data, float *dst_data, const float *gamm
NNACL_CHECK_NULL_RETURN_ERR(src_data);
NNACL_CHECK_NULL_RETURN_ERR(dst_data);
NNACL_CHECK_ZERO_RETURN_ERR(param->op_parameter_.thread_num_);
int channel = param->channel_;
int hw_plane = param->inner_size_;
int channel = param->channel_, hw_plane = param->inner_size_;
int channel_step = UP_DIV(UP_DIV(channel, C8NUM), param->op_parameter_.thread_num_) * C8NUM;
int channel_begin = (int)(task_id)*channel_step;
int channel_end = MSMIN(channel_begin + channel_step, channel);
@@ -330,40 +281,33 @@ int InstanceNormNC8HW8(const float *src_data, float *dst_data, const float *gamm
const float *src = src_b + c * hw_plane;
const float *src1 = src_b + (c + C8NUM) * hw_plane;
float *dst = dst_b + c;
MS_FLOAT32X8 mean = MS_MOV256_F32(0.0f);
MS_FLOAT32X8 mean1 = MS_MOV256_F32(0.0f);
MS_FLOAT32X8 square_mean = MS_MOV256_F32(0.0f);
MS_FLOAT32X8 square_mean1 = MS_MOV256_F32(0.0f);
MS_FLOAT32X8 mean = MS_MOV256_F32(0.0f), mean1 = MS_MOV256_F32(0.0f);
MS_FLOAT32X8 squ_m = MS_MOV256_F32(0.0f), squ_m1 = MS_MOV256_F32(0.0f);
for (int index = 0; index < hw_plane; ++index) {
MS_FLOAT32X8 srcv = MS_LD256_F32(src + index * C8NUM);
MS_FLOAT32X8 srcv1 = MS_LD256_F32(src1 + index * C8NUM);
MS_FLOAT32X8 squarev = MS_MUL256_F32(srcv, srcv);
MS_FLOAT32X8 squarev1 = MS_MUL256_F32(srcv1, srcv1);
MS_FLOAT32X8 srcv = MS_LD256_F32(src + index * C8NUM), srcv1 = MS_LD256_F32(src1 + index * C8NUM);
MS_FLOAT32X8 squarev = MS_MUL256_F32(srcv, srcv), squarev1 = MS_MUL256_F32(srcv1, srcv1);
mean = MS_ADD256_F32(mean, srcv);
mean1 = MS_ADD256_F32(mean1, srcv1);
square_mean = MS_ADD256_F32(square_mean, squarev);
square_mean1 = MS_ADD256_F32(square_mean1, squarev1);
squ_m = MS_ADD256_F32(squ_m, squarev);
squ_m1 = MS_ADD256_F32(squ_m1, squarev1);
}
mean = MS_DIV256_F32(mean, hw_planev);
mean1 = MS_DIV256_F32(mean1, hw_planev);
square_mean = MS_DIV256_F32(square_mean, hw_planev);
square_mean1 = MS_DIV256_F32(square_mean1, hw_planev);
squ_m = MS_DIV256_F32(squ_m, hw_planev);
squ_m1 = MS_DIV256_F32(squ_m1, hw_planev);
MS_FLOAT32X8 deno =
MS_ADD256_F32(MS_SUB256_F32(square_mean, MS_MUL256_F32(mean, mean)), MS_MOV256_F32(param->epsilon_));
MS_ADD256_F32(MS_SUB256_F32(squ_m, MS_MUL256_F32(mean, mean)), MS_MOV256_F32(param->epsilon_));
MS_FLOAT32X8 deno1 =
MS_ADD256_F32(MS_SUB256_F32(square_mean1, MS_MUL256_F32(mean1, mean1)), MS_MOV256_F32(param->epsilon_));
MS_ADD256_F32(MS_SUB256_F32(squ_m1, MS_MUL256_F32(mean1, mean1)), MS_MOV256_F32(param->epsilon_));
deno = MS_DIV256_F32(MS_MOV256_F32(1.0f), MS_SQRTFX8_F32(deno));
deno1 = MS_DIV256_F32(MS_MOV256_F32(1.0f), MS_SQRTFX8_F32(deno1));

MS_FLOAT32X8 gammav = MS_MUL256_F32(MS_LD256_F32(gamma_data + c), deno); // deno * gamma_data[c]
MS_FLOAT32X8 gammav1 = MS_MUL256_F32(MS_LD256_F32(gamma_data + c + C8NUM), deno1); // deno1 * gamma_data[c]
MS_FLOAT32X8 betav = MS_LD256_F32(beta_data + c);
MS_FLOAT32X8 betav1 = MS_LD256_F32(beta_data + c + C8NUM);
MS_FLOAT32X8 betav = MS_LD256_F32(beta_data + c), betav1 = MS_LD256_F32(beta_data + c + C8NUM);
for (int index = 0; index < hw_plane; ++index) {
MS_FLOAT32X8 srcv = MS_LD256_F32(src + index * C8NUM);
MS_FLOAT32X8 srcv1 = MS_LD256_F32(src1 + index * C8NUM);
MS_FLOAT32X8 outv = MS_SUB256_F32(srcv, mean);
MS_FLOAT32X8 outv1 = MS_SUB256_F32(srcv1, mean1);
MS_FLOAT32X8 srcv = MS_LD256_F32(src + index * C8NUM), srcv1 = MS_LD256_F32(src1 + index * C8NUM);
MS_FLOAT32X8 outv = MS_SUB256_F32(srcv, mean), outv1 = MS_SUB256_F32(srcv1, mean1);
outv = MS_MUL256_F32(outv, gammav);
outv1 = MS_MUL256_F32(outv1, gammav1);
outv = MS_ADD256_F32(outv, betav);
@@ -375,46 +319,42 @@ int InstanceNormNC8HW8(const float *src_data, float *dst_data, const float *gamm
for (; c <= channel_end - C8NUM; c += C8NUM) {
const float *src = src_b + c * hw_plane;
float *dst = dst_b + c;
MS_FLOAT32X8 mean = MS_MOV256_F32(0.0f);
MS_FLOAT32X8 square_mean = MS_MOV256_F32(0.0f);
MS_FLOAT32X8 mean = MS_MOV256_F32(0.0f), squ_m = MS_MOV256_F32(0.0f);
for (int index = 0; index < hw_plane; ++index) {
MS_FLOAT32X8 srcv = MS_LD256_F32(src + index * C8NUM);
MS_FLOAT32X8 squarev = MS_MUL256_F32(srcv, srcv);
mean = MS_ADD256_F32(mean, srcv);
square_mean = MS_ADD256_F32(square_mean, squarev);
squ_m = MS_ADD256_F32(squ_m, squarev);
}
mean = MS_DIV256_F32(mean, hw_planev);
square_mean = MS_DIV256_F32(square_mean, hw_planev);
MS_FLOAT32X8 deno = MS_ADD256_F32(MS_SUB256_F32(square_mean, MS_MUL256_F32(mean, mean)),
squ_m = MS_DIV256_F32(squ_m, hw_planev);
MS_FLOAT32X8 deno = MS_ADD256_F32(MS_SUB256_F32(squ_m, MS_MUL256_F32(mean, mean)),
MS_MOV256_F32(param->epsilon_)); // 256uestion
deno = MS_DIV256_F32(MS_MOV256_F32(1.0f), MS_SQRTFX8_F32(deno));

MS_FLOAT32X8 gammav = MS_MUL256_F32(MS_LD256_F32(gamma_data + c), deno); // deno * gamma_data[c]
MS_FLOAT32X8 betav = MS_LD256_F32(beta_data + c);
for (int index = 0; index < hw_plane; ++index) {
MS_FLOAT32X8 srcv = MS_LD256_F32(src + index * C8NUM);
MS_FLOAT32X8 outv = MS_SUB256_F32(srcv, mean);
MS_FLOAT32X8 srcv = MS_LD256_F32(src + index * C8NUM), outv = MS_SUB256_F32(srcv, mean);
outv = MS_MUL256_F32(outv, gammav);
outv = MS_ADD256_F32(outv, betav);
MS_ST256_F32(dst + index * channel, outv);
}
}
for (; c < channel_end; ++c) {
int c8_down_loop = c / C8NUM * C8NUM;
int c8_mod = c % C8NUM;
int c8_down_loop = c / C8NUM * C8NUM, c8_mod = c % C8NUM;
int c_res = MSMIN(channel_end - c8_down_loop, C8NUM);
const float *src = src_b + c8_down_loop * hw_plane + c8_mod;
float *dst = dst_b + c;
float mean = 0.0f;
float square_mean = 0.0f;
float mean = 0.0f, squ_m = 0.0f;
for (int index = 0; index < hw_plane; ++index) {
float tmp = src[index * c_res];
mean += tmp;
square_mean += tmp * tmp;
squ_m += tmp * tmp;
}
mean /= (float)hw_plane;
square_mean /= (float)hw_plane;
const float deno = gamma_data[c] / sqrtf(square_mean - mean * mean + param->epsilon_);
squ_m /= (float)hw_plane;
const float deno = gamma_data[c] / sqrtf(squ_m - mean * mean + param->epsilon_);
for (int index = 0; index < hw_plane; ++index) {
dst[index * channel] = (src[index * c_res] - mean) * deno + beta_data[c];
}


+ 12
- 0
mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/instance_norm_fp32.h View File

@@ -23,6 +23,18 @@
extern "C" {
#endif

// Adds four SIMD vectors (v1..v4) into four accumulators (in1..in4) in place.
// NOTE(review): multi-statement macro without a do { } while (0) wrapper — callers must
// not use it as the sole body of an unbraced if/else; all current call sites are braced.
#define MS_ADDQ_F32_VEC(in1, in2, in3, in4, v1, v2, v3, v4) \
in1 = MS_ADDQ_F32(in1, v1); \
in2 = MS_ADDQ_F32(in2, v2); \
in3 = MS_ADDQ_F32(in3, v3); \
in4 = MS_ADDQ_F32(in4, v4);

// Divides four SIMD accumulators (in1..in4) by the same vector divisor v in place
// (used to turn per-channel sums into means by dividing by the hw-plane size).
#define MS_DIVQ_F32_VEC(in1, in2, in3, in4, v) \
in1 = MS_DIVQ_F32(in1, v); \
in2 = MS_DIVQ_F32(in2, v); \
in3 = MS_DIVQ_F32(in3, v); \
in4 = MS_DIVQ_F32(in4, v);

int InstanceNorm(const float *src_data, float *dst_data, const float *gamma_data, const float *beta_data,
const InstanceNormParameter *param, size_t task_id);
int InstanceNormNC4HW4(const float *src_data, float *dst_data, const float *gamma_data, const float *beta_data,


+ 4
- 9
mindspore/core/mindrt/include/async/collect.h View File

@@ -88,19 +88,14 @@ class Collected {

template <typename T>
inline Future<std::list<T>> Collect(const std::list<Future<T>> &futures) {
if (futures.empty()) {
return Future<std::list<T>>(std::list<T>());
}
if (futures.empty()) return Future<std::list<T>>(std::list<T>());

Promise<std::list<T>> *promise = new (std::nothrow) Promise<std::list<T>>();
MINDRT_OOM_EXIT(promise);
using CollectType = Collected<T>;
std::shared_ptr<CollectType> collect = std::make_shared<CollectType>(futures, promise);
std::shared_ptr<Collected<T>> collect = std::make_shared<Collected<T>>(futures, promise);

//
auto iter = futures.begin();
for (; iter != futures.end(); ++iter) {
iter->OnComplete(Defer(collect, &CollectType::Waited, std::placeholders::_1));
for (auto iter = futures.begin(); iter != futures.end(); ++iter) {
iter->OnComplete(Defer(collect, &Collected<T>::Waited, std::placeholders::_1));
}

Future<std::list<T>> future = promise->GetFuture();


+ 2
- 2
mindspore/core/mindrt/src/async/uuid_base.cc View File

@@ -43,7 +43,7 @@ Option<uuid> uuid::FromBytes(const std::string &s) {
return Option<uuid>(MindrtNone());
}
uuid u;
memcpy(&u.uuidData, s.data(), s.size());
(void)memcpy(&u.uuidData, s.data(), s.size());
return Option<uuid>(u);
}

@@ -154,7 +154,7 @@ uuid RandomBasedGenerator::GenerateRandomUuid() {
static std::atomic<uint64_t> ul(1);
uint64_t lCount = ul.fetch_add(1);
uint64_t offSet = distribution(gen) % RIGHT_SHIFT_BITS;
memcpy(tmpUUID.BeginAddress() + offSet, &lCount, sizeof(lCount));
(void)memcpy(tmpUUID.BeginAddress() + offSet, &lCount, sizeof(lCount));

// set the variant
*(tmpUUID.BeginAddress() + VARIANT_BIT_OFFSET) &= 0xBF;


+ 33
- 0
mindspore/core/ops/splice.h View File

@@ -25,18 +25,51 @@
namespace mindspore {
namespace ops {
constexpr auto kNameSplice = "Splice";
/// \brief All defined All operator prototype of lite.
class MS_CORE_API Splice : public PrimitiveC {
public:
/// \brief Constructor.
Splice() : PrimitiveC(kNameSplice) { InitIOName({"inputs"}, {"outputs"}); }

/// \brief Destructor.
~Splice() = default;
MS_DECLARE_PARENT(Splice, PrimitiveC);

/// \brief Method to init the op's attributes.
///
/// \param[in] contexts Define the contexts.
/// \param[in] forward_indexes Define the forward indexes.
/// \param[in] output_dims Define the output dims.
void Init(const std::vector<int64_t> &contexts, const std::vector<int64_t> &forward_indexes, int64_t output_dims);

/// \brief Method to set contexts attributes.
///
/// \param[in] contexts Define the contexts.
void set_context(const std::vector<int64_t> &contexts);

/// \brief Method to set forward_indexes attributes.
///
/// \param[in] forward_indexes Define the forward_indexes.
void set_forward_indexes(const std::vector<int64_t> &forward_indexes);

/// \brief Method to set output_dim attributes.
///
/// \param[in] output_dim Define the output_dim.
void set_output_dim(int64_t output_dim);

/// \brief Method to set context attributes.
///
/// \param[in] context Define the context.
std::vector<int64_t> get_context() const;

/// \brief Method to set forward_indexes attributes.
///
/// \param[in] forward_indexes Define the forward_indexes.
std::vector<int64_t> get_forward_indexes() const;

/// \brief Method to set output_dim attributes.
///
/// \param[in] output_dim Define the output_dim.
int64_t get_output_dim() const;
AbstractBasePtr SpliceInfer(const abstract::AnalysisEnginePtr &, const PrimitivePtr &primitive,
const std::vector<AbstractBasePtr> &input_args);


+ 2
- 2
mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_fp32.cc View File

@@ -320,7 +320,7 @@ int ArithmeticCPUKernel::BatchScalarCalc(int task_id) {
if (break_pos_ < 1) {
return RET_ERROR;
}
if (break_pos_ > MAX_ARITHMETIC_DIMS_SIZE || param_->out_strides_[break_pos_ - 1] == 0) {
if (break_pos_ > ARITHMETIC_SUPPORT_DIMS_NUM || param_->out_strides_[break_pos_ - 1] == 0) {
MS_LOG(ERROR) << "param_->out_strides_[break_pos_ - 1] is 0 or break_pos_ is > 10";
return RET_ERROR;
}
@@ -351,7 +351,7 @@ int ArithmeticCPUKernel::BatchScalarCalc(int task_id) {
}

int ArithmeticCPUKernel::BiasCalc(int task_id) {
if (param_->ndim_ > MAX_ARITHMETIC_DIMS_SIZE || param_->out_shape_[param_->ndim_ - 1] == 0) {
if (param_->ndim_ > ARITHMETIC_SUPPORT_DIMS_NUM || param_->out_shape_[param_->ndim_ - 1] == 0) {
MS_LOG(ERROR) << "BiasCalc param is error!";
return RET_ERROR;
}


+ 0
- 1
mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_fp32.h View File

@@ -39,7 +39,6 @@ using mindspore::schema::PrimitiveType_NotEqual;
using mindspore::schema::PrimitiveType_RealDiv;
using mindspore::schema::PrimitiveType_SquaredDifference;
using mindspore::schema::PrimitiveType_SubFusion;
#define MAX_ARITHMETIC_DIMS_SIZE 20

namespace mindspore::kernel {
class ArithmeticCPUKernel : public InnerKernel {


+ 7
- 2
mindspore/lite/src/runtime/kernel/arm/fp32/group_convolution_fp32.cc View File

@@ -79,8 +79,13 @@ int GroupConvolutionFp32CPUKernel::Prepare() {
MS_LOG(ERROR) << "GetSingleConv for fp32 group conv failed.";
return lite::RET_ERROR;
}
group_convs_.emplace_back(new (std::nothrow) ConvolutionDelegateCPUKernel(
reinterpret_cast<OpParameter *>(new_conv_param), new_inputs, new_outputs, ctx_));
auto new_conv = new (std::nothrow)
ConvolutionDelegateCPUKernel(reinterpret_cast<OpParameter *>(new_conv_param), new_inputs, new_outputs, ctx_);
if (new_conv == nullptr) {
MS_LOG(ERROR) << "malloc new conv error.";
return lite::RET_ERROR;
}
(void)group_convs_.emplace_back(new_conv);
}
return GroupConvolutionBaseCPUKernel::Prepare();
}


+ 1
- 1
mindspore/lite/src/runtime/kernel/arm/fp32/lstm_fp32.cc View File

@@ -380,7 +380,7 @@ int LstmCPUKernel::LstmUnidirectional(float *output, const float *weight_i, cons
return RET_OK;
}

void LstmCPUKernel::RecordStates(float *cell_state, int step) {
void LstmCPUKernel::RecordStates(const float *cell_state, int step) {
float *workspace = reinterpret_cast<float *>(out_tensors_[kWorkspaceOutIdx]->MutableData());
workspace[step] = *cell_state;
}


+ 1
- 1
mindspore/lite/src/runtime/kernel/arm/fp32/lstm_fp32.h View File

@@ -49,7 +49,7 @@ class LstmCPUKernel : public InnerKernel {
int LstmUnidirectional(float *output, const float *weight_i, const float *weight_h, const float *input_bias,
const float *state_bias, float *hidden_state, float *cell_state, bool is_backward);
int InnerExecute(float *output, const float *input, float *hidden_state, float *cell_state);
void RecordStates(float *cell_state, int step);
void RecordStates(const float *cell_state, int step);
const float *weight_loop_;
const float *bias_loop_;
float *gate_loop_;


+ 6
- 6
mindspore/lite/src/runtime/kernel/arm/fp32/non_max_suppression_fp32.cc View File

@@ -138,16 +138,16 @@ int NonMaxSuppressionCPUKernel::Run_Selecte(bool simple_out, int box_num, int ba
auto cand = sorted_candidates.top();
bool selected = true;
auto IoUSuppressed = [this, &cand](const NMSBox &box) {
float intersec_x1 = std::max(cand.x1_, box.x1_);
float intersec_x2 = std::min(cand.x2_, box.x2_);
float intersec_y1 = std::max(cand.y1_, box.y1_);
float intersec_y2 = std::min(cand.y2_, box.y2_);
float intersec_x1 = std::max(cand.get_x1(), box.get_x1());
float intersec_x2 = std::min(cand.get_x2(), box.get_x2());
float intersec_y1 = std::max(cand.get_y1(), box.get_y1());
float intersec_y2 = std::min(cand.get_y2(), box.get_y2());
const float intersec_area =
std::max(intersec_x2 - intersec_x1, 0.0f) * std::max(intersec_y2 - intersec_y1, 0.0f);
if (intersec_area <= 0.0f) {
return false;
}
const float intersec_over_union = intersec_area / (cand.area_ + box.area_ - intersec_area);
const float intersec_over_union = intersec_area / (cand.get_area() + box.get_area() - intersec_area);
return intersec_over_union > this->iou_threshold_;
};
if (std::any_of(selected_box_per_class.begin(), selected_box_per_class.end(), IoUSuppressed)) {
@@ -156,7 +156,7 @@ int NonMaxSuppressionCPUKernel::Run_Selecte(bool simple_out, int box_num, int ba
if (selected) {
selected_box_per_class.push_back(cand);
selected_index.emplace_back(
NMSIndex{static_cast<int32_t>(i), static_cast<int32_t>(j), static_cast<int32_t>(cand.index_)});
NMSIndex{static_cast<int32_t>(i), static_cast<int32_t>(j), static_cast<int32_t>(cand.get_index())});
}
sorted_candidates.pop();
}


+ 9
- 0
mindspore/lite/src/runtime/kernel/arm/fp32/non_max_suppression_fp32.h View File

@@ -84,6 +84,15 @@ class NMSBox {
}

public:
const float get_score() const { return score_; }
const int get_index() const { return index_; }
const float get_y1() const { return y1_; }
const float get_y2() const { return y2_; }
const float get_x1() const { return x1_; }
const float get_x2() const { return x2_; }
const float get_area() const { return area_; }

private:
float score_;
int index_;
float y1_; // y1 x1 y2 x2 ascending order


+ 1
- 1
mindspore/lite/src/runtime/kernel/arm/fp32/pad_fp32.cc View File

@@ -146,7 +146,7 @@ void PadCPUKernel::InitMirrorPadBlock() {
for (size_t i = 0; i < pad_region.size(); ++i) {
int di = size_offset + i;
int si = remain_dim_offset + i;
if (di > DEFAULT_PAD_NDIMS) {
if (di >= DEFAULT_PAD_NDIMS) {
continue;
}
switch (pad_cord[i]) {


Loading…
Cancel
Save