@@ -31,99 +31,57 @@ int8_t MinInt8(int8_t a, int8_t b) { return b ^ ((a ^ b) & -(a < b)); }
 int8_t MaxInt8(int8_t a, int8_t b) { return a ^ ((a ^ b) & -(a < b)); }
 
 void ReluFp32(float *data, float *dst, int ele_num) {
-  int four_block = UP_DIV(ele_num, C4NUM);
-  for (int i = 0; i < four_block - 1; i++) {
-    int index = i * C4NUM;
-#ifdef ENABLE_NEON
-    float32x4_t relu_data = vld1q_f32(data + index);
-    float32x4_t zero_data = vdupq_n_f32(0);
-    relu_data = vmaxq_f32(relu_data, zero_data);
-    vst1q_f32(dst + index, relu_data);
-#else
-    data[index] = data[index] < 0 ? 0 : data[index];
-    data[index + 1] = data[index + 1] < 0 ? 0 : data[index + 1];
-    data[index + 2] = data[index + 2] < 0 ? 0 : data[index + 2];
-    data[index + 3] = data[index + 3] < 0 ? 0 : data[index + 3];
-#endif
-  }
-  for (int j = (four_block - 1) * C4NUM; j < ele_num; ++j) {
-    data[j] = data[j] < 0 ? 0 : data[j];
+  int index = 0;
+#ifdef ENABLE_AVX
+  int c8_block = DOWN_DIV(ele_num, C8NUM) * C8NUM;
+  for (; index < c8_block; index += C8NUM) {
+    MS_FLOAT32X8 relu_data = MS_LD256_F32(data + index);
+    MS_FLOAT32X8 zero_data = MS_MOV256_F32(0.0f);
+    relu_data = MS_MAX256_F32(relu_data, zero_data);
+    MS_ST256_F32(dst + index, relu_data);
   }
-}
-
-void Relu6Fp32(float *data, float *dst, int ele_num) {
-  int four_block = UP_DIV(ele_num, C4NUM);
-  for (int i = 0; i < four_block - 1; i++) {
-    int index = i * C4NUM;
-#ifdef ENABLE_NEON
-    float32x4_t relu6_data = vld1q_f32(data + index);
-    float32x4_t zero_data = vdupq_n_f32(0);
-    float32x4_t six_data = vdupq_n_f32(6);
-    relu6_data = vmaxq_f32(relu6_data, zero_data);
-    relu6_data = vminq_f32(relu6_data, six_data);
-    vst1q_f32(dst + index, relu6_data);
-#else
-    data[index] = data[index] < 0 ? 0 : data[index];
-    data[index] = data[index] > 6 ? 6 : data[index];
-    data[index + 1] = data[index + 1] < 0 ? 0 : data[index + 1];
-    data[index + 1] = data[index + 1] > 6 ? 6 : data[index + 1];
-    data[index + 2] = data[index + 2] < 0 ? 0 : data[index + 2];
-    data[index + 2] = data[index + 2] > 6 ? 6 : data[index + 2];
-    data[index + 3] = data[index + 3] < 0 ? 0 : data[index + 3];
-    data[index + 3] = data[index + 3] > 6 ? 6 : data[index + 3];
 #endif
+#if defined(ENABLE_NEON) || defined(ENABLE_SSE)
+  int c4_block = DOWN_DIV(ele_num, C4NUM) * C4NUM;
+  for (; index < c4_block; index += C4NUM) {
+    MS_FLOAT32X4 relu_data = MS_LDQ_F32(data + index);
+    MS_FLOAT32X4 zero_data = MS_MOVQ_F32(0.0f);
+    relu_data = MS_MAXQ_F32(relu_data, zero_data);
+    MS_STQ_F32(dst + index, relu_data);
   }
-  for (int j = (four_block - 1) * C4NUM; j < ele_num; ++j) {
-    data[j] = data[j] < 0 ? 0 : data[j];
-    data[j] = data[j] > 6 ? 6 : data[j];
+#endif
+  for (; index < ele_num; ++index) {
+    data[index] = data[index] < 0.0f ? 0.0f : data[index];
   }
 }
 
+void Relu6Fp32(float *data, float *dst, int ele_num) {
+  int index = 0;
 #ifdef ENABLE_AVX
-#ifdef WIN32
-void ReluFp32C8(float *data, float *dst, int ele_num) {
-  int four_block = UP_DIV(ele_num, C8NUM);
-  for (int i = 0; i < four_block - 1; i++) {
-    int index = i * C8NUM;
-    data[index] = data[index] < 0 ? 0 : data[index];
-    data[index + 1] = data[index + 1] < 0 ? 0 : data[index + 1];
-    data[index + 2] = data[index + 2] < 0 ? 0 : data[index + 2];
-    data[index + 3] = data[index + 3] < 0 ? 0 : data[index + 3];
-    data[index + 4] = data[index + 4] < 0 ? 0 : data[index + 4];
-    data[index + 5] = data[index + 5] < 0 ? 0 : data[index + 5];
-    data[index + 6] = data[index + 6] < 0 ? 0 : data[index + 6];
-    data[index + 7] = data[index + 7] < 0 ? 0 : data[index + 7];
-  }
-  for (int j = (four_block - 1) * C8NUM; j < ele_num; ++j) {
-    data[j] = data[j] < 0 ? 0 : data[j];
+  int c8_block = DOWN_DIV(ele_num, C8NUM) * C8NUM;
+  for (; index < c8_block; index += C8NUM) {
+    MS_FLOAT32X8 relu6_data = MS_LD256_F32(data + index);
+    MS_FLOAT32X8 zero_data = MS_MOV256_F32(0.0f);
+    MS_FLOAT32X8 six_data = MS_MOV256_F32(6.0f);
+    relu6_data = MS_MAX256_F32(relu6_data, zero_data);
+    relu6_data = MS_MIN256_F32(relu6_data, six_data);
+    MS_ST256_F32(dst + index, relu6_data);
   }
-}
-
-void Relu6Fp32C8(float *data, float *dst, int ele_num) {
-  int four_block = UP_DIV(ele_num, C8NUM);
-  for (int i = 0; i < four_block - 1; i++) {
-    int index = i * C8NUM;
-    data[index] = data[index] < 0 ? 0 : data[index];
-    data[index] = data[index] > 6 ? 6 : data[index];
-    data[index + 1] = data[index + 1] < 0 ? 0 : data[index + 1];
-    data[index + 1] = data[index + 1] > 6 ? 6 : data[index + 1];
-    data[index + 2] = data[index + 2] < 0 ? 0 : data[index + 2];
-    data[index + 2] = data[index + 2] > 6 ? 6 : data[index + 2];
-    data[index + 3] = data[index + 3] < 0 ? 0 : data[index + 3];
-    data[index + 3] = data[index + 3] > 6 ? 6 : data[index + 3];
-    data[index + 4] = data[index + 4] < 0 ? 0 : data[index + 4];
-    data[index + 4] = data[index + 4] > 6 ? 6 : data[index + 4];
-    data[index + 5] = data[index + 5] < 0 ? 0 : data[index + 5];
-    data[index + 5] = data[index + 5] > 6 ? 6 : data[index + 5];
-    data[index + 6] = data[index + 6] < 0 ? 0 : data[index + 6];
-    data[index + 6] = data[index + 6] > 6 ? 6 : data[index + 6];
-    data[index + 7] = data[index + 7] < 0 ? 0 : data[index + 7];
-    data[index + 7] = data[index + 7] > 6 ? 6 : data[index + 7];
+#endif
+#if defined(ENABLE_NEON) || defined(ENABLE_SSE)
+  int c4_block = DOWN_DIV(ele_num, C4NUM) * C4NUM;
+  for (; index < c4_block; index += C4NUM) {
+    MS_FLOAT32X4 relu6_data = MS_LDQ_F32(data + index);
+    MS_FLOAT32X4 zero_data = MS_MOVQ_F32(0.0f);
+    MS_FLOAT32X4 six_data = MS_MOVQ_F32(6.0f);
+    relu6_data = MS_MAXQ_F32(relu6_data, zero_data);
+    relu6_data = MS_MINQ_F32(relu6_data, six_data);
+    MS_STQ_F32(dst + index, relu6_data);
   }
-  for (int j = (four_block - 1) * C8NUM; j < ele_num; ++j) {
-    data[j] = data[j] < 0 ? 0 : data[j];
-    data[j] = data[j] > 6 ? 6 : data[j];
+#endif
+  for (; index < ele_num; ++index) {
+    data[index] = data[index] < 0.0f ? 0.0f : data[index];
+    data[index] = data[index] > 6.0f ? 6.0f : data[index];
   }
 }
-#endif
-#endif
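
Note on the hunk above: the old code sized its loop with UP_DIV and vectorized every block except the last, falling back to scalar code for the remainder (and to a fully scalar path when NEON was unavailable); the new code walks a single running index through 8-lane AVX blocks, then 4-lane NEON/SSE blocks, then a scalar tail, which also makes the Windows-only ReluFp32C8/Relu6Fp32C8 fallbacks unnecessary. Below is a minimal standalone sketch of the DOWN_DIV tail-handling idiom, assuming the usual nnacl definitions UP_DIV(x, y) = ((x) + (y) - 1) / (y) and DOWN_DIV(x, y) = (x) / (y); ReluSketch itself is illustrative, not part of the patch:

    #define DOWN_DIV(x, y) ((x) / (y))
    #define C4NUM 4

    // c4_block is the largest multiple of C4NUM not exceeding ele_num, so the
    // block loop never touches memory past the end of the buffer and the tail
    // loop visits each remaining element exactly once.
    void ReluSketch(const float *src, float *dst, int ele_num) {
      int index = 0;
      int c4_block = DOWN_DIV(ele_num, C4NUM) * C4NUM;
      for (; index < c4_block; index += C4NUM) {
        for (int k = 0; k < C4NUM; ++k) {  // a single SIMD max(x, 0.0f) in the real code
          dst[index + k] = src[index + k] < 0.0f ? 0.0f : src[index + k];
        }
      }
      for (; index < ele_num; ++index) {
        dst[index] = src[index] < 0.0f ? 0.0f : src[index];
      }
    }

One inconsistency carried over from the old code is worth flagging: the SIMD paths store their results to dst while the scalar tail writes back into data, so ReluFp32 and Relu6Fp32 only produce a complete result in dst when they are called in place (data == dst).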
@@ -17,7 +17,7 @@
 #include <vector>
 #include "include/errorcode.h"
 #include "schema/model_generated.h"
-#include "src/runtime/kernel/arm/fp16/bias_fp16.h"
+#include "src/runtime/kernel/arm/fp16/biasadd_fp16.h"
 #include "src/kernel_registry.h"
 
 using mindspore::kernel::KERNEL_ARCH::kCPU;
@@ -29,7 +29,7 @@ using mindspore::schema::PrimitiveType_BiasAdd;
 
 namespace mindspore::kernel {
-int BiasCPUFp16Kernel::ReSize() {
+int BiasAddCPUFp16Kernel::ReSize() {
   auto dims = in_tensors_.at(0)->shape();
   bias_param_->ndim_ = dims.size();
   if (bias_param_->ndim_ < 1 || bias_param_->ndim_ > 5) {
@@ -45,13 +45,20 @@ int BiasCPUFp16Kernel::ReSize() {
   return RET_OK;
 }
 
-int BiasCPUFp16Kernel::Run() {
+int BiasAddCPUFp16Kernel::Run() {
+  if (bias_data_ == nullptr) {
+    auto ret = GetBiasData();
+    if (ret != RET_OK) {
+      MS_LOG(ERROR) << "GetBiasData is error in run!";
+      return ret;
+    }
+  }
   auto in = reinterpret_cast<float16_t *>(in_tensors_.at(0)->MutableData());
   auto out = reinterpret_cast<float16_t *>(out_tensors_.at(0)->MutableData());
   size_t data_size = in_tensors_.at(0)->ElementsNum();
   MS_ASSERT(context_->allocator != nullptr);
-  auto *tile_in = reinterpret_cast<float16_t *>(context_->allocator->Malloc(data_size * sizeof(float16_t)));
-  auto *tile_bias = reinterpret_cast<float16_t *>(context_->allocator->Malloc(data_size * sizeof(float16_t)));
+  auto tile_in = reinterpret_cast<float16_t *>(context_->allocator->Malloc(data_size * sizeof(float16_t)));
+  auto tile_bias = reinterpret_cast<float16_t *>(context_->allocator->Malloc(data_size * sizeof(float16_t)));
   if (tile_in == nullptr || tile_bias == nullptr) {
     MS_LOG(ERROR) << "Memory allocation failed";
     context_->allocator->Free(tile_in);
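
Note on the hunk above: Run() now backstops Init(). If bias_data_ was not populated at Init() time (the non-const-bias case handled further down), GetBiasData() runs on first use, presumably because a non-const bias tensor carries no valid data until runtime. The auto * to auto change on tile_in/tile_bias is purely stylistic. On allocation failure both pointers are passed to Free() even though at most one allocation succeeded, which assumes the allocator's Free() tolerates a null argument.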
@@ -64,43 +64,54 @@ int BiasCPUFp16Kernel::Run() {
   return RET_OK;
 }
 
-BiasCPUFp16Kernel::~BiasCPUFp16Kernel() {
+BiasAddCPUFp16Kernel::~BiasAddCPUFp16Kernel() {
   if ((bias_data_type_ == kNumberTypeFloat || bias_data_type_ == kNumberTypeFloat32) && bias_data_ != nullptr) {
     free(bias_data_);
     bias_data_ = nullptr;
   }
 }
 
-int BiasCPUFp16Kernel::Init() {
-  auto bias_tensor = in_tensors_.at(1);
-  MS_ASSERT(bias_tensor != nullptr);
-  bias_data_type_ = bias_tensor->data_type();
+int BiasAddCPUFp16Kernel::GetBiasData() {
+  bias_data_type_ = bias_tensor_->data_type();
   if (bias_data_type_ == kNumberTypeFloat || bias_data_type_ == kNumberTypeFloat32) {
-    bias_data_ = reinterpret_cast<float16_t *>(malloc(bias_tensor->ElementsNum() * sizeof(float16_t)));
+    bias_data_ = reinterpret_cast<float16_t *>(malloc(bias_tensor_->ElementsNum() * sizeof(float16_t)));
     if (bias_data_ == nullptr) {
      MS_LOG(ERROR) << "bias_data_ is nullptr";
      return RET_NULL_PTR;
    }
-    auto *bias = reinterpret_cast<float *>(bias_tensor->MutableData());
+    auto bias = reinterpret_cast<float *>(bias_tensor_->MutableData());
    if (bias == nullptr) {
      MS_LOG(ERROR) << "bias is nullptr!";
      return RET_NULL_PTR;
    }
-    for (int i = 0; i < bias_tensor->ElementsNum(); ++i) {
+    for (int i = 0; i < bias_tensor_->ElementsNum(); ++i) {
      bias_data_[i] = (float16_t)(bias[i]);
    }
  } else {
-    bias_data_ = reinterpret_cast<float16_t *>(bias_tensor->MutableData());
+    bias_data_ = reinterpret_cast<float16_t *>(bias_tensor_->MutableData());
    if (bias_data_ == nullptr) {
      MS_LOG(ERROR) << "bias_data_ is nullptr";
      return RET_NULL_PTR;
    }
  }
+  return RET_OK;
+}
+
+int BiasAddCPUFp16Kernel::Init() {
+  bias_tensor_ = in_tensors_.at(1);
+  MS_ASSERT(bias_tensor_ != nullptr);
+  if (bias_tensor_->IsConst()) {
+    auto ret = GetBiasData();
+    if (ret != RET_OK) {
+      MS_LOG(ERROR) << "GetBiasData is error in Init()!";
+      return ret;
+    }
+  }
   if (!InferShapeDone()) {
     return RET_OK;
   }
   return ReSize();
 }
 
-REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_BiasAdd, LiteKernelCreator<BiasCPUFp16Kernel>)
+REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_BiasAdd, LiteKernelCreator<BiasAddCPUFp16Kernel>)
 }  // namespace mindspore::kernel
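
Note on the hunk above: splitting the old Init() into Init() plus GetBiasData() means a constant bias is converted from fp32 to fp16 exactly once, up front, while a runtime-provided bias is converted lazily on the first Run(). The bias_data_type_ member doubles as an ownership flag: the destructor frees bias_data_ only when it was malloc'ed for the fp32-to-fp16 conversion, since in the fp16 branch bias_data_ merely aliases the tensor's own buffer. A self-contained sketch of that fp32 branch, assuming an ARM toolchain where float16_t is available (ConvertBiasToFp16 is a hypothetical helper, not the kernel's API):

    #include <arm_neon.h>  // float16_t on ARM toolchains with FP16 support
    #include <cstdlib>

    // The returned buffer is malloc-owned and must be free()d by the caller;
    // this is the same rule the kernel encodes via its bias_data_type_ check.
    float16_t *ConvertBiasToFp16(const float *src, int n) {
      auto *dst = static_cast<float16_t *>(malloc(n * sizeof(float16_t)));
      if (dst == nullptr) {
        return nullptr;
      }
      for (int i = 0; i < n; ++i) {
        dst[i] = static_cast<float16_t>(src[i]);
      }
      return dst;
    }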
@@ -14,31 +14,33 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_BIAS_H_
-#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_BIAS_H_
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_BIASADD_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_BIASADD_H_
 
 #include <vector>
 #include "src/lite_kernel.h"
 #include "nnacl/fp16/arithmetic_fp16.h"
 
 namespace mindspore::kernel {
-class BiasCPUFp16Kernel : public LiteKernel {
+class BiasAddCPUFp16Kernel : public LiteKernel {
  public:
-  BiasCPUFp16Kernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
-                    const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
+  BiasAddCPUFp16Kernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
+                       const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
       : LiteKernel(parameter, inputs, outputs, ctx) {
     bias_param_ = reinterpret_cast<ArithmeticParameter *>(parameter);
   }
-  ~BiasCPUFp16Kernel() override;
+  ~BiasAddCPUFp16Kernel() override;
 
   int Init() override;
   int ReSize() override;
   int Run() override;
 
  private:
+  int GetBiasData();
   ArithmeticParameter *bias_param_ = nullptr;
   float16_t *bias_data_ = nullptr;
+  lite::Tensor *bias_tensor_ = nullptr;
   TypeId bias_data_type_;
 };
 }  // namespace mindspore::kernel
 
-#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_BIAS_H_
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_BIASADD_H_