From: @lzkcode Reviewed-by: @zhang_xue_tong,@zhanghaibo5 Signed-off-by: @zhang_xue_tongpull/13581/MERGE
| @@ -118,3 +118,10 @@ int ElementReciprocalFp16(float16_t *input, float16_t *output, int element_size) | |||
| } | |||
| return NNACL_OK; | |||
| } | |||
| int ElementErfFp16(float16_t *input, float16_t *output, int element_size) { | |||
| for (int i = 0; i < element_size; i++) { | |||
| output[i] = erff(input[i]); | |||
| } | |||
| return NNACL_OK; | |||
| } | |||
| @@ -50,6 +50,8 @@ int ElementCeilFp16(float16_t *input, float16_t *output, int number); | |||
| int ElementNegativeFp16(float16_t *input, float16_t *output, int element_size); | |||
| int ElementReciprocalFp16(float16_t *input, float16_t *output, int element_size); | |||
| int ElementErfFp16(float16_t *input, float16_t *output, int element_size); | |||
| #ifdef __cplusplus | |||
| } | |||
| #endif | |||
| @@ -0,0 +1,33 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "nnacl/fp16/exp_fp16.h" | |||
| #include <math.h> | |||
| #include <string.h> | |||
| #include "nnacl/errorcode.h" | |||
| void ExpFp16(const float16_t *src, float16_t *dst, int num) { | |||
| int i = 0; | |||
| #ifdef ENABLE_ARM64 | |||
| int count = (num / C8NUM) * C8NUM; | |||
| for (; i < count; i += C8NUM) { | |||
| simd_exp_fp16(vld1q_f16(src + i), dst + i); | |||
| } | |||
| #endif | |||
| for (; i < num; ++i) { | |||
| single_exp_fp16(src[i], dst + i); | |||
| } | |||
| } | |||
| @@ -0,0 +1,70 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_LITE_NNACL_FP16_EXP_H_ | |||
| #define MINDSPORE_LITE_NNACL_FP16_EXP_H_ | |||
| #include "nnacl/op_base.h" | |||
| #ifdef __cplusplus | |||
| extern "C" { | |||
| #endif | |||
| void ExpFp16(const float16_t *src, float16_t *dst, int num); | |||
| #if defined(ENABLE_ARM64) | |||
| static inline float32x4_t exp_fp32(float32x4_t input) { | |||
| static float32x4_t param[] = {{0.693147f, 0.693147f, 0.693147f, 0.693147f}, | |||
| {1.0f / 120, 1.0f / 120, 1.0f / 120, 1.0f / 120}, | |||
| {1.0f / 24, 1.0f / 24, 1.0f / 24, 1.0f / 24}, | |||
| {1.0f / 6, 1.0f / 6, 1.0f / 6, 1.0f / 6}, | |||
| {0.5f, 0.5f, 0.5f, 0.5f}, | |||
| {1.0f, 1.0f, 1.0f, 1.0f}}; | |||
| int32x4_t integer = vcvtq_s32_f32(input / param[0]); | |||
| float32x4_t decimal = input - vcvtq_f32_s32(integer) * param[0]; | |||
| int32x4_t int_exp = vshlq_s32((integer + vmovq_n_s32(127)), vmovq_n_s32(23)); | |||
| float32x4_t decimal_exp = | |||
| param[5] + | |||
| decimal * (param[5] + decimal * (param[4] + decimal * (param[3] + decimal * (param[2] + decimal * param[1])))); | |||
| decimal_exp = decimal_exp * vld1q_f32((float *)(&int_exp)); | |||
| return decimal_exp; | |||
| } | |||
| static inline void simd_exp_fp16(float16x8_t input, float16_t *dst) { | |||
| static float16x8_t maxv = {88.0f, 88.0f, 88.0f, 88.0f, 88.0f, 88.0f, 88.0f, 88.0f}; | |||
| static float16x8_t minv = {-88.0f, -88.0f, -88.0f, -88.0f, -88.0f, -88.0f, -88.0f, -88.0f}; | |||
| input = vmaxq_f16(minv, vminq_f16(input, maxv)); | |||
| float32x4_t input_low = vcvt_f32_f16(vget_low_f16(input)); | |||
| float32x4_t input_high = vcvt_high_f32_f16(input); | |||
| vst1q_f16(dst, vcombine_f16(vcvt_f16_f32(exp_fp32(input_low)), vcvt_f16_f32(exp_fp32(input_high)))); | |||
| } | |||
| #endif | |||
| static inline void single_exp_fp16(float16_t src, float16_t *dst) { | |||
| static float param[] = {0.693147f, 1.0f / 120, 1.0f / 24, 1.0f / 6, 1.0f / 2, 1.0f}; | |||
| src = MSMAX(-88.0f, MSMIN(88.0f, src)); | |||
| int integer = (float)src / param[0]; | |||
| float decimal = (float)src - integer * param[0]; | |||
| int int_exp = (integer + 127) << 23; | |||
| float decimal_exp = | |||
| 1.0f + decimal * (1.0f + decimal * (0.5f + decimal * (param[3] + decimal * (param[2] + decimal * param[1])))); | |||
| *dst = (float16_t)(*((float *)&int_exp) * decimal_exp); | |||
| } | |||
| #ifdef __cplusplus | |||
| } | |||
| #endif | |||
| #endif // MINDSPORE_LITE_NNACL_FP16_EXP_H_ | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -16,7 +16,79 @@ | |||
| #include "nnacl/fp16/softmax_fp16.h" | |||
| #include <math.h> | |||
| #include <float.h> | |||
| #include "nnacl/fp16/exp_fp16.h" | |||
| void SoftmaxNormFp16(const float16_t *src, float16_t *dst, int batch, int channel) { | |||
| int cur_batch_offset = 0; | |||
| for (int i = 0; i < batch; i++, cur_batch_offset += channel) { | |||
| int j = 0; | |||
| #ifdef ENABLE_ARM64 | |||
| float16x8_t max_8 = vdupq_n_f16(-FLT16_MAX); | |||
| int count = (channel / C8NUM) * C8NUM; | |||
| for (; j < count; j += C8NUM) { | |||
| float16x8_t input_8 = vld1q_f16(src + cur_batch_offset + j); | |||
| max_8 = vmaxq_f16(max_8, input_8); | |||
| } | |||
| float16_t max = vmaxvq_f16(max_8); | |||
| #else | |||
| float16_t max = -FLT_MAX; | |||
| #endif | |||
| for (; j < channel; j++) { | |||
| float16_t input = src[cur_batch_offset + j]; | |||
| if (input > max) { | |||
| max = input; | |||
| } | |||
| } | |||
| int k = 0; | |||
| #ifdef ENABLE_NEON | |||
| int count2 = (channel / C8NUM) * C8NUM; | |||
| for (; k < count2; k += C8NUM) { | |||
| float16x8_t input_8 = vld1q_f16(src + cur_batch_offset + k); | |||
| float16x8_t output_8 = vsubq_f16(input_8, vdupq_n_f16(max)); | |||
| vst1q_f16(dst + cur_batch_offset + k, output_8); | |||
| } | |||
| #endif | |||
| for (; k < channel; k++) { | |||
| int offset = cur_batch_offset + k; | |||
| dst[offset] = src[offset] - max; | |||
| } | |||
| } | |||
| } | |||
| void SumAndDivFp16(const float16_t *src, float16_t *dst, int batch, int channel) { | |||
| int cur_batch_offset = 0; | |||
| for (int i = 0; i < batch; i++, cur_batch_offset += channel) { | |||
| float16_t sum = 0; | |||
| int j = 0; | |||
| #ifdef ENABLE_NEON | |||
| float16x8_t sum8 = vdupq_n_f16(0); | |||
| int count = (channel / C8NUM) * C8NUM; | |||
| for (; j < count; j += C8NUM) { | |||
| sum8 = vaddq_f16(sum8, vld1q_f16(src + cur_batch_offset + j)); | |||
| } | |||
| sum = sum8[0] + sum8[1] + sum8[2] + sum8[3] + sum8[4] + sum8[5] + sum8[6] + sum8[7]; | |||
| #endif | |||
| for (; j < channel; j++) { | |||
| sum += src[cur_batch_offset + j]; | |||
| } | |||
| int k = 0; | |||
| #ifdef ENABLE_NEON | |||
| const float16_t div = 1.0f / sum; | |||
| for (; k < count; k += C8NUM) { | |||
| vst1q_f16(dst + cur_batch_offset + k, vmulq_n_f16(vld1q_f16(src + cur_batch_offset + k), div)); | |||
| } | |||
| #endif | |||
| for (; k < channel; k++) { | |||
| dst[cur_batch_offset + k] = src[cur_batch_offset + k] / sum; | |||
| } | |||
| } | |||
| } | |||
| void SoftmaxLastAxisFp16(const float16_t *src, float16_t *dst, int batch, int channel) { | |||
| SoftmaxNormFp16(src, dst, batch, channel); | |||
| ExpFp16(dst, dst, batch * channel); | |||
| SumAndDivFp16(dst, dst, batch, channel); | |||
| } | |||
| // output = exp(input) / reduce_sum(exp(input), axis) | |||
| void SoftmaxFp16(const float16_t *input_ptr, float16_t *output_ptr, float16_t *sum_data, SoftmaxParameter *parameter) { | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -26,6 +26,7 @@ | |||
| extern "C" { | |||
| #endif | |||
| void SoftmaxFp16(const float16_t *input_ptr, float16_t *output_ptr, float16_t *sum_data, SoftmaxParameter *parameter); | |||
| void SoftmaxLastAxisFp16(const float16_t *src, float16_t *dst, int batch, int channel); | |||
| #ifdef __cplusplus | |||
| } | |||
| #endif | |||
| @@ -53,11 +53,10 @@ static inline void simd_exp(MS_FLOAT32X4 input, float *dst) { | |||
| MS_INT32X4 integer = MS_CVTQPS_EPI32(input / param[0]); | |||
| MS_FLOAT32X4 decimal = input - MS_CVTQEPI32_PS(integer) * param[0]; | |||
| MS_INT32X4 int_exp = MS_SLLIQ_EPI32(MS_ADDQ_EPI32(integer, MS_MOVQ_EPI32(127)), 23); | |||
| memcpy(dst, &int_exp, sizeof(int32_t) * 4); | |||
| MS_FLOAT32X4 decimal_exp = | |||
| param[5] + | |||
| decimal * (param[5] + decimal * (param[4] + decimal * (param[3] + decimal * (param[2] + decimal * param[1])))); | |||
| MS_STQ_F32(dst, decimal_exp * MS_LDQ_F32(dst)); | |||
| MS_STQ_F32(dst, decimal_exp * MS_LDQ_F32((float *)(&int_exp))); | |||
| } | |||
| #endif | |||
| @@ -76,11 +75,10 @@ static inline void simd_exp_avx(MS_FLOAT32X8 input, float *dst) { | |||
| MS_INT32X8 integer = MS_CVT256PS_EPI32(input / param[0]); | |||
| MS_FLOAT32X8 decimal = input - MS_CVT256EPI32_PS(integer) * param[0]; | |||
| MS_INT32X8 int_exp = MS_SLLI256_EPI32(MS_ADD256_EPI32(integer, MS_MOV256_EPI32(127)), 23); | |||
| memcpy(dst, &int_exp, sizeof(int32_t) * 8); | |||
| MS_FLOAT32X8 decimal_exp = | |||
| param[5] + | |||
| decimal * (param[5] + decimal * (param[4] + decimal * (param[3] + decimal * (param[2] + decimal * param[1])))); | |||
| MS_ST256_F32(dst, decimal_exp * MS_LD256_F32(dst)); | |||
| MS_ST256_F32(dst, decimal_exp * MS_LD256_F32((float *)(&int_exp))); | |||
| } | |||
| #endif | |||
| @@ -90,10 +88,10 @@ static inline void single_exp(float src, float *dst) { | |||
| int integer = src / param[0]; | |||
| float decimal = src - integer * param[0]; | |||
| int int_exp = (integer + 127) << 23; | |||
| memcpy(dst, &int_exp, sizeof(float)); | |||
| const float decimal_exp = | |||
| float decimal_exp = | |||
| 1.0f + decimal * (1.0f + decimal * (0.5f + decimal * (param[3] + decimal * (param[2] + decimal * param[1])))); | |||
| *dst *= decimal_exp; | |||
| float *ptr = (float *)&int_exp; | |||
| *dst = *ptr * decimal_exp; | |||
| } | |||
| #ifdef __cplusplus | |||
| } | |||
| @@ -22,14 +22,21 @@ void SoftmaxNorm(const float *src, float *dst, int batch, int channel) { | |||
| int cur_batch_offset = 0; | |||
| for (int i = 0; i < batch; i++, cur_batch_offset += channel) { | |||
| int j = 0; | |||
| #ifdef ENABLE_ARM64 | |||
| #ifdef ENABLE_NEON | |||
| float32x4_t max4 = vdupq_n_f32(-FLT_MAX); | |||
| int count = (channel / C4NUM) * C4NUM; | |||
| for (; j < count; j += C4NUM) { | |||
| float32x4_t input4 = vld1q_f32(src + cur_batch_offset + j); | |||
| max4 = vmaxq_f32(max4, input4); | |||
| } | |||
| #ifdef ENABLE_ARM64 | |||
| float max = vmaxvq_f32(max4); | |||
| #else | |||
| float max = max4[0]; | |||
| for (int m = 1; m < 4; ++m) { | |||
| max = MSMAX(max, max4[m]); | |||
| } | |||
| #endif | |||
| #else | |||
| float max = -FLT_MAX; | |||
| #endif | |||
| @@ -66,7 +73,11 @@ void SumAndDiv(const float *src, float *dst, int batch, int channel) { | |||
| for (; j < count; j += C4NUM) { | |||
| sum4 = vaddq_f32(sum4, vld1q_f32(src + cur_batch_offset + j)); | |||
| } | |||
| #ifdef ENABLE_ARM64 | |||
| sum = vaddvq_f32(sum4); | |||
| #else | |||
| sum = sum4[0] + sum4[1] + sum4[2] + sum4[3]; | |||
| #endif | |||
| #endif | |||
| for (; j < channel; j++) { | |||
| sum += src[cur_batch_offset + j]; | |||
| @@ -61,6 +61,7 @@ | |||
| #define kInputSize2 3 | |||
| #define MAX_AXIS_SIZE 6 | |||
| #define MAX_LEN 256 | |||
| #define FLT16_MAX 65504 | |||
| typedef enum LiteDataType { | |||
| kDataTypeFloat, | |||
| @@ -43,7 +43,8 @@ ArithmeticSelfFp16Func ArithmeticSelfFp16CPUKernel::GetArithmeticSelfFp16Fun(int | |||
| {mindspore::schema::PrimitiveType_Ceil, ElementCeilFp16}, | |||
| {mindspore::schema::PrimitiveType_Round, ElementRoundFp16}, | |||
| {mindspore::schema::PrimitiveType_Neg, ElementNegativeFp16}, | |||
| {mindspore::schema::PrimitiveType_Reciprocal, ElementReciprocalFp16}}; | |||
| {mindspore::schema::PrimitiveType_Reciprocal, ElementReciprocalFp16}, | |||
| {mindspore::schema::PrimitiveType_Erf, ElementErfFp16}}; | |||
| for (size_t i = 0; i < sizeof(type_func_table) / sizeof(TYPE_FUNC_INFO); i++) { | |||
| if (type_func_table[i].primitive_type_ == primitive_type) { | |||
| return type_func_table[i].func_; | |||
| @@ -98,4 +99,5 @@ REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Ceil, LiteKernelCreator<Arith | |||
| REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Round, LiteKernelCreator<ArithmeticSelfFp16CPUKernel>) | |||
| REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Neg, LiteKernelCreator<ArithmeticSelfFp16CPUKernel>) | |||
| REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Reciprocal, LiteKernelCreator<ArithmeticSelfFp16CPUKernel>) | |||
| REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Erf, LiteKernelCreator<ArithmeticSelfFp16CPUKernel>) | |||
| } // namespace mindspore::kernel | |||
| @@ -43,59 +43,74 @@ int SoftmaxFp16CPUKernel::Init() { | |||
| return ReSize(); | |||
| } | |||
| int SoftmaxFp16CPUKernel::ReSize() { return SoftmaxBaseCPUKernel::ReSize(); } | |||
| int SoftmaxFp16CPUKernel::MallocTmpBuffer() { | |||
| int SoftmaxFp16CPUKernel::ReSize() { | |||
| auto ret = SoftmaxBaseCPUKernel::ReSize(); | |||
| if (ret != RET_OK) { | |||
| return ret; | |||
| } | |||
| auto n_dim = softmax_param_->n_dim_; | |||
| auto axis = softmax_param_->axis_; | |||
| if (axis == -1) { | |||
| softmax_param_->axis_ += n_dim; | |||
| axis = softmax_param_->axis_; | |||
| } | |||
| auto in_shape = in_tensors_.front()->shape(); | |||
| int out_plane_size = 1; | |||
| out_plane_size_ = 1; | |||
| for (int i = 0; i < axis; ++i) { | |||
| out_plane_size *= in_shape[i]; | |||
| out_plane_size_ *= in_shape[i]; | |||
| } | |||
| int in_plane_size = 1; | |||
| in_plane_size_ = 1; | |||
| for (int i = axis + 1; i < n_dim; i++) { | |||
| in_plane_size *= in_shape[i]; | |||
| in_plane_size_ *= in_shape[i]; | |||
| } | |||
| sum_data_ = | |||
| reinterpret_cast<float16_t *>(context_->allocator->Malloc(out_plane_size * in_plane_size * sizeof(float16_t))); | |||
| if (sum_data_ != nullptr) { | |||
| free(sum_data_); | |||
| } | |||
| sum_data_ = reinterpret_cast<float16_t *>(malloc(out_plane_size_ * in_plane_size_ * sizeof(float16_t))); | |||
| if (sum_data_ == nullptr) { | |||
| MS_LOG(ERROR) << "malloc data for softmax fail!"; | |||
| return RET_ERROR; | |||
| } | |||
| memset(sum_data_, 0, out_plane_size * in_plane_size * sizeof(float16_t)); | |||
| return RET_OK; | |||
| } | |||
| void SoftmaxFp16CPUKernel::FreeTmpBuffer() { | |||
| if (sum_data_ != nullptr) { | |||
| context_->allocator->Free(sum_data_); | |||
| sum_data_ = nullptr; | |||
| } | |||
| int SoftmaxFp16CPUKernel::DoSoftmaxLastAxis(int task_id) { | |||
| int unit = UP_DIV(out_plane_size_, context_->thread_num_); | |||
| int begin = task_id * unit; | |||
| int end = MSMIN(begin + unit, out_plane_size_); | |||
| int channel = softmax_param_->input_shape_[softmax_param_->axis_]; | |||
| int offset = begin * channel; | |||
| auto input_ptr = reinterpret_cast<float16_t *>(in_tensors_.at(kInputIndex)->MutableData()); | |||
| auto output_ptr = reinterpret_cast<float16_t *>(out_tensors_.at(kOutputIndex)->MutableData()); | |||
| SoftmaxLastAxisFp16(input_ptr + offset, output_ptr + offset, end - begin, channel); | |||
| return RET_OK; | |||
| } | |||
| int SoftmaxFp16CPUKernel::Run() { | |||
| auto ret = MallocTmpBuffer(); | |||
| int SoftmaxLastAxisFp16Run(void *cdata, int task_id) { | |||
| auto kernel = reinterpret_cast<SoftmaxFp16CPUKernel *>(cdata); | |||
| auto ret = kernel->DoSoftmaxLastAxis(task_id); | |||
| if (ret != RET_OK) { | |||
| FreeTmpBuffer(); | |||
| MS_LOG(ERROR) << "MallocTmpBuffer failed"; | |||
| return RET_ERROR; | |||
| MS_LOG(ERROR) << "DoSoftmaxLastAxisFp16 error task_id: " << task_id << ", ret: " << ret; | |||
| } | |||
| return ret; | |||
| } | |||
| int SoftmaxFp16CPUKernel::Run() { | |||
| auto input_tensor = in_tensors_.at(0); | |||
| MS_ASSERT(input_tensor); | |||
| auto output_tensor = out_tensors_.at(0); | |||
| MS_ASSERT(output_tensor); | |||
| input_fp16_ = reinterpret_cast<float16_t *>(input_tensor->data_c()); | |||
| MS_ASSERT(input_fp16_); | |||
| output_fp16_ = reinterpret_cast<float16_t *>(output_tensor->data_c()); | |||
| SoftmaxFp16(input_fp16_, output_fp16_, sum_data_, softmax_param_); | |||
| FreeTmpBuffer(); | |||
| MS_ASSERT(output_fp16_); | |||
| if (in_plane_size_ == 1) { | |||
| auto ret = ParallelLaunch(this->context_->thread_pool_, SoftmaxLastAxisFp16Run, this, context_->thread_num_); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "SoftmaxFp16CPUKernel ParallelLaunch failed, ret: " << ret; | |||
| } | |||
| return ret; | |||
| } else { | |||
| MS_ASSERT(sum_data_); | |||
| memset(sum_data_, 0, out_plane_size_ * in_plane_size_ * sizeof(float16_t)); | |||
| SoftmaxFp16(input_fp16_, output_fp16_, sum_data_, softmax_param_); | |||
| } | |||
| return RET_OK; | |||
| } | |||
| @@ -28,18 +28,25 @@ class SoftmaxFp16CPUKernel : public SoftmaxBaseCPUKernel { | |||
| SoftmaxFp16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx) | |||
| : SoftmaxBaseCPUKernel(parameter, inputs, outputs, ctx), sum_data_(nullptr) {} | |||
| ~SoftmaxFp16CPUKernel() = default; | |||
| ~SoftmaxFp16CPUKernel() override { | |||
| if (sum_data_ != nullptr) { | |||
| free(sum_data_); | |||
| } | |||
| } | |||
| int Init() override; | |||
| int ReSize() override; | |||
| int Run() override; | |||
| int MallocTmpBuffer(); | |||
| void FreeTmpBuffer(); | |||
| int DoSoftmaxLastAxis(int task_id); | |||
| private: | |||
| float16_t *sum_data_ = nullptr; | |||
| float16_t *input_fp16_ = nullptr; | |||
| float16_t *output_fp16_ = nullptr; | |||
| int in_plane_size_ = 0; | |||
| int out_plane_size_ = 0; | |||
| }; | |||
| } // namespace mindspore::kernel | |||
| @@ -62,3 +62,5 @@ ml_video_edit_oneclick_adaptis.pb 3 | |||
| # Q_hand_0812.pb is not suitable for float16. Out of float16 range. | |||
| Q_hand_0812.pb | |||
| tacotron_encoder_stf.pb 5;1:1,62:1,62:1,62:1,62 | |||
| Q_inception-249970-672-11-16.pb 1 | |||
| Q_dila-small-mix-full-fineturn-390000-nopixel-nosigmoid.pb 1 | |||