From: @ling_qiao_min
Tag: tags/v1.2.0-rc1
@@ -15,31 +15,22 @@
  */
 #include "nnacl/base/gather_base.h"
-int GatherFp32(const float *input, int outer_size, int inner_size, int limit, const int *indices,
-               int indices_element_size, float *output) {
-  for (int m = 0; m < outer_size; ++m) {
-    const float *inputm = input + inner_size * m * limit;
-    float *outputm = output + inner_size * m * indices_element_size;
-    for (int i = 0; i < indices_element_size; ++i) {
-      if (indices[i] < 0 || indices[i] > limit) {
-        return NNACL_ERR;
-      }
-      memcpy(outputm + i * inner_size, inputm + indices[i] * inner_size, sizeof(float) * inner_size);
-    }
-  }
-  return NNACL_OK;
-}
-int GatherInt32(const int32_t *input, int outer_size, int inner_size, int limit, const int *indices,
-                int indices_element_size, int32_t *output) {
+int Gather(const void *input, int outer_size, int inner_size, int limit, const int *indices, int indices_element_size,
+           void *output, int data_size) {
+  const int8_t *int8_in = (int8_t *)input;
+  int8_t *int8_out = (int8_t *)output;
   for (int m = 0; m < outer_size; ++m) {
-    const int32_t *inputm = input + inner_size * m * limit;
-    int32_t *outputm = output + inner_size * m * indices_element_size;
+    const int8_t *int8_in_m = int8_in + inner_size * m * limit * data_size;
+    int8_t *int8_out_m = int8_out + inner_size * m * indices_element_size * data_size;
     for (int i = 0; i < indices_element_size; ++i) {
       if (indices[i] < 0 || indices[i] > limit) {
         return NNACL_ERR;
       }
-      memcpy(outputm + i * inner_size, inputm + indices[i] * inner_size, sizeof(int32_t) * inner_size);
+      memcpy(int8_out_m + i * inner_size * data_size, int8_in_m + indices[i] * inner_size * data_size,
+             data_size * inner_size);
     }
   }
   return NNACL_OK;
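
Review note: merging GatherFp32/GatherInt32 into one byte-based Gather is a clean dedup; the element type only ever mattered for the memcpy width, which data_size now carries. Below is a standalone sketch (not part of the patch) exercising the merged routine at two widths, with NNACL_OK/NNACL_ERR stubbed as local constants. One thing worth a second look: the kept bounds check `indices[i] > limit` still admits `indices[i] == limit`, which reads one slice past the end; the sketch uses `>= limit` instead.

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

static const int kNnaclOk = 0;   // local stand-ins for NNACL_OK / NNACL_ERR
static const int kNnaclErr = 1;

// Body reproduced from the hunk above; the only deliberate change is the >= bound.
static int Gather(const void *input, int outer_size, int inner_size, int limit, const int *indices,
                  int indices_element_size, void *output, int data_size) {
  const int8_t *int8_in = static_cast<const int8_t *>(input);
  int8_t *int8_out = static_cast<int8_t *>(output);
  for (int m = 0; m < outer_size; ++m) {
    const int8_t *int8_in_m = int8_in + inner_size * m * limit * data_size;
    int8_t *int8_out_m = int8_out + inner_size * m * indices_element_size * data_size;
    for (int i = 0; i < indices_element_size; ++i) {
      if (indices[i] < 0 || indices[i] >= limit) {  // patch keeps "> limit" here
        return kNnaclErr;
      }
      memcpy(int8_out_m + i * inner_size * data_size, int8_in_m + indices[i] * inner_size * data_size,
             data_size * inner_size);
    }
  }
  return kNnaclOk;
}

int main() {
  // Input shape [3, 2], gathered along axis 0 with indices {2, 0}:
  // outer_size = 1 (dims before the axis), inner_size = 2 (dims after), limit = 3.
  const int idx[2] = {2, 0};
  float in_f[6] = {0, 1, 2, 3, 4, 5};  // rows {0,1} {2,3} {4,5}
  float out_f[4] = {0};
  Gather(in_f, 1, 2, 3, idx, 2, out_f, static_cast<int>(sizeof(float)));
  int32_t in_i[6] = {0, 1, 2, 3, 4, 5};
  int32_t out_i[4] = {0};
  Gather(in_i, 1, 2, 3, idx, 2, out_i, static_cast<int>(sizeof(int32_t)));  // same routine, int32 width
  printf("float: %g %g %g %g | int32: %d %d %d %d\n", out_f[0], out_f[1], out_f[2], out_f[3],
         out_i[0], out_i[1], out_i[2], out_i[3]);  // both print 4 5 0 1
  return 0;
}
```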
@@ -24,10 +24,8 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
-int GatherFp32(const float *input, int outer_size, int inner_size, int limit, const int *indices,
-               int indices_element_size, float *output);
-int GatherInt32(const int32_t *input, int outer_size, int inner_size, int limit, const int *indices,
-                int indices_element_size, int32_t *output);
+int Gather(const void *input, int outer_size, int inner_size, int limit, const int *indices, int indices_element_size,
+           void *output, int data_size);
#ifdef __cplusplus
 }
 #endif
@@ -20,7 +20,6 @@
 #include "src/runtime/runtime_api.h"
 #include "include/errorcode.h"
 #include "src/runtime/kernel/arm/fp16/common_fp16.h"
-#include "nnacl/fp16/cast_fp16.h"
 using mindspore::kernel::KERNEL_ARCH::kCPU;
 using mindspore::lite::KernelRegistrar;
@@ -47,35 +46,6 @@ int ActivationFp16CPUKernel::Init() {
 int ActivationFp16CPUKernel::ReSize() { return RET_OK; }
-int ActivationFp16CPUKernel::MallocTmpBuffer() {
-  fp16_input_ = ConvertInputFp32toFp16(in_tensors_.at(0), context_);
-  if (fp16_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc data failed";
-    return RET_ERROR;
-  }
-  fp16_output_ = MallocOutputFp16(out_tensors_.at(0), context_);
-  if (fp16_output_ == nullptr) {
-    MS_LOG(ERROR) << "malloc data failed";
-    return RET_ERROR;
-  }
-  return RET_OK;
-}
-void ActivationFp16CPUKernel::FreeTmpBuffer() {
-  if (in_tensors_.at(0)->data_type() == kNumberTypeFloat32) {
-    if (fp16_input_ != nullptr) {
-      context_->allocator->Free(fp16_input_);
-      fp16_input_ = nullptr;
-    }
-  }
-  if (out_tensors_.at(0)->data_type() == kNumberTypeFloat32) {
-    if (fp16_output_ != nullptr) {
-      context_->allocator->Free(fp16_output_);
-      fp16_output_ = nullptr;
-    }
-  }
-}
 int ActivationFp16CPUKernel::DoActivation(int task_id) {
   auto length = in_tensors_.at(0)->ElementsNum();
@@ -115,24 +85,18 @@ int ActivationFp16Run(void *cdata, int task_id) {
 }
 int ActivationFp16CPUKernel::Run() {
-  auto ret = MallocTmpBuffer();
-  if (ret != RET_OK) {
-    FreeTmpBuffer();
-    return ret;
-  }
+  auto input_tensor = in_tensors_.at(0);
+  auto output_tensor = out_tensors_.at(0);
+  fp16_input_ = reinterpret_cast<float16_t *>(input_tensor->data_c());
+  fp16_output_ = reinterpret_cast<float16_t *>(output_tensor->data_c());
   int error_code = ParallelLaunch(this->context_->thread_pool_, ActivationFp16Run, this, thread_count_);
   if (error_code != RET_OK) {
     MS_LOG(ERROR) << "Activation function error error_code[" << error_code << "]";
-    FreeTmpBuffer();
     return RET_ERROR;
   }
-  auto out_tensor = out_tensors_.at(0);
-  if (out_tensor->data_type() == kNumberTypeFloat32) {
-    Float16ToFloat32(fp16_output_, reinterpret_cast<float *>(out_tensor->MutableData()), out_tensor->ElementsNum());
-  }
-  FreeTmpBuffer();
   return RET_OK;
 }
@@ -37,8 +37,6 @@ class ActivationFp16CPUKernel : public LiteKernel {
   int ReSize() override;
   int Run() override;
   int DoActivation(int task_id);
-  int MallocTmpBuffer();
-  void FreeTmpBuffer();
  private:
   int thread_count_;
@@ -209,6 +209,7 @@ int ArithmeticFP16CPUKernel::Run() {
     FreeTmpBuffer();
     return RET_ERROR;
   }
+
   auto ret = ParallelLaunch(this->context_->thread_pool_, ArithmeticsRunFp16, this, context_->thread_num_);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "ArithmeticsRunFp16 run error error_code[" << ret << "]";
@@ -16,7 +16,6 @@
 #include "src/runtime/kernel/arm/fp16/arithmetic_self_fp16.h"
 #include "src/runtime/kernel/arm/fp16/common_fp16.h"
 #include "src/kernel_registry.h"
-#include "nnacl/fp16/cast_fp16.h"
 #include "nnacl/fp16/arithmetic_self_fp16.h"
 using mindspore::lite::KernelRegistrar;
@@ -72,36 +71,17 @@ int ArithmeticSelfFp16CPUKernel::DoExecute(int task_id) {
   return ret;
 }
-void ArithmeticSelfFp16CPUKernel::FreeInputAndOutput() {
-  if (in_tensors_.at(0)->data_type() == kNumberTypeFloat32) {
-    context_->allocator->Free(input_fp16_ptr_);
-    input_fp16_ptr_ = nullptr;
-  }
-  if (out_tensors_.at(0)->data_type() == kNumberTypeFloat32) {
-    context_->allocator->Free(output_fp16_ptr_);
-    output_fp16_ptr_ = nullptr;
-  }
-}
 int ArithmeticSelfFp16CPUKernel::Run() {
   auto input_tensor = in_tensors_.at(0);
   auto output_tensor = out_tensors_.at(0);
-  input_fp16_ptr_ = ConvertInputFp32toFp16(input_tensor, context_);
-  output_fp16_ptr_ = MallocOutputFp16(output_tensor, context_);
-  if (input_fp16_ptr_ == nullptr || output_fp16_ptr_ == nullptr) {
-    FreeInputAndOutput();
-    MS_LOG(ERROR) << "input or output is nullptr";
-    return RET_ERROR;
-  }
+  input_fp16_ptr_ = reinterpret_cast<float16_t *>(input_tensor->data_c());
+  output_fp16_ptr_ = reinterpret_cast<float16_t *>(output_tensor->data_c());
   auto ret = ParallelLaunch(this->context_->thread_pool_, ArithmeticSelfRun, this, op_parameter_->thread_num_);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "ArithmeticSelfRun error error_code[" << ret << "]";
   }
-  if (out_tensors_.at(0)->data_type() == kNumberTypeFloat32) {
-    Float16ToFloat32(output_fp16_ptr_, reinterpret_cast<float *>(output_tensor->MutableData()),
-                     output_tensor->ElementsNum());
-  }
-  FreeInputAndOutput();
   return ret;
 }
@@ -35,7 +35,6 @@ class ArithmeticSelfFp16CPUKernel : public ArithmeticSelfCPUKernel {
   int DoExecute(int task_id) override;
  private:
-  void FreeInputAndOutput();
   ArithmeticSelfFp16Func GetArithmeticSelfFp16Fun(int primitive_type);
   ArithmeticSelfFp16Func fp16_func_ = nullptr;
   float16_t *input_fp16_ptr_ = nullptr;
@@ -49,37 +49,18 @@ static int CropFp16Run(void *cdata, int task_id) {
 }
 int CropFp16CPUKernel::Run() {
-  input_ptr_ = ConvertInputFp32toFp16(in_tensors_.at(kInputIndex), context_);
-  output_ptr_ = MallocOutputFp16(out_tensors_.at(kOutputIndex), context_);
-  if (input_ptr_ == nullptr || output_ptr_ == nullptr) {
-    MS_LOG(ERROR) << "input or output is nullptr";
-    FreeInputAndOutput();
-    return RET_ERROR;
-  }
+  auto input_tensor = in_tensors_.at(0);
+  auto output_tensor = out_tensors_.at(0);
+  input_ptr_ = reinterpret_cast<float16_t *>(input_tensor->data_c());
+  output_ptr_ = reinterpret_cast<float16_t *>(output_tensor->data_c());
   auto ret = ParallelLaunch(this->context_->thread_pool_, CropFp16Run, this, crop_para_->thread_count_);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "ParallelLaunch failed: " << ret;
-    FreeInputAndOutput();
-  }
-  if (out_tensors_.at(kOutputIndex)->data_type() == kNumberTypeFloat32) {
-    Float16ToFloat32(output_ptr_, reinterpret_cast<float *>(out_tensors_.at(kOutputIndex)->data_c()),
-                     out_tensors_.at(kOutputIndex)->ElementsNum());
   }
-  FreeInputAndOutput();
   return ret;
 }
-void CropFp16CPUKernel::FreeInputAndOutput() {
-  if (in_tensors_.at(kInputIndex)->data_type() == kNumberTypeFloat32) {
-    context_->allocator->Free(input_ptr_);
-    input_ptr_ = nullptr;
-  }
-  if (out_tensors_.at(kOutputIndex)->data_type() == kNumberTypeFloat32) {
-    context_->allocator->Free(output_ptr_);
-    output_ptr_ = nullptr;
-  }
-}
 REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Crop, LiteKernelCreator<CropFp16CPUKernel>)
 }  // namespace mindspore::kernel
@@ -21,7 +21,6 @@
 #include <vector>
 #include "include/errorcode.h"
 #include "nnacl/crop_parameter.h"
-#include "nnacl/fp16/cast_fp16.h"
 #include "nnacl/fp16/crop_fp16.h"
 #include "src/lite_kernel.h"
 #include "src/runtime/kernel/arm/base/crop_base.h"
@@ -44,7 +43,6 @@ class CropFp16CPUKernel : public CropBaseCPUKernel {
  private:
   float16_t *input_ptr_ = nullptr;
   float16_t *output_ptr_ = nullptr;
-  void FreeInputAndOutput();
 };
 }  // namespace mindspore::kernel
@@ -16,7 +16,6 @@
 #include "src/runtime/kernel/arm/fp16/pad_fp16.h"
 #include "src/runtime/kernel/arm/fp16/common_fp16.h"
-#include "nnacl/fp16/cast_fp16.h"
 #include "src/kernel_registry.h"
 #include "src/runtime/runtime_api.h"
@@ -43,16 +42,10 @@ int PadFp16CPUKernel::RunMirrorPadImpl(int task_id) {
 int PadFp16CPUKernel::Run() {
   auto input_tensor = in_tensors_.at(0);
   auto output_tensor = out_tensors_.at(0);
-  is_input_fp32_ = input_tensor->data_type() == kNumberTypeFloat32;
-  is_output_fp32_ = output_tensor->data_type() == kNumberTypeFloat32;
-  input_ = ConvertInputFp32toFp16(input_tensor, context_);
-  output_ = MallocOutputFp16(output_tensor, context_);
-  if (input_ == nullptr || output_ == nullptr) {
-    FreeInputAndOutput();
-    MS_LOG(ERROR) << "input or output is nullptr";
-    return RET_ERROR;
-  }
+  input_ = reinterpret_cast<float16_t *>(input_tensor->data_c());
+  output_ = reinterpret_cast<float16_t *>(output_tensor->data_c());
   int ret = 0;
   if (pad_param_->pad_mode_ == static_cast<int>(schema::PaddingMode_CONSTANT)) {
     if (pad_param_->constant_value_ - 0.0f < 1e-5) {
@@ -73,22 +66,8 @@ int PadFp16CPUKernel::Run() {
       MS_LOG(ERROR) << "Pad Reflect or Symmetric mode run error, error_code[" << ret << "]";
     }
   }
-  if (is_output_fp32_) {
-    Float16ToFloat32(output_, reinterpret_cast<float *>(output_tensor->MutableData()), output_tensor->ElementsNum());
-  }
-  FreeInputAndOutput();
-  return ret;
-}
-void PadFp16CPUKernel::FreeInputAndOutput() {
-  if (is_input_fp32_) {
-    context_->allocator->Free(input_);
-    input_ = nullptr;
-  }
-  if (is_output_fp32_) {
-    context_->allocator->Free(output_);
-    output_ = nullptr;
-  }
+  return ret;
 }
 REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Pad, LiteKernelCreator<PadFp16CPUKernel>)
@@ -35,9 +35,6 @@ class PadFp16CPUKernel : public PadCPUKernel {
   int RunMirrorPadImpl(int task_id) override;
  private:
-  void FreeInputAndOutput();
-  bool is_input_fp32_ = false;
-  bool is_output_fp32_ = false;
   float16_t *input_ = nullptr;
   float16_t *output_ = nullptr;
 };
@@ -20,7 +20,6 @@
 #include "src/runtime/runtime_api.h"
 #include "include/errorcode.h"
 #include "nnacl/op_base.h"
-#include "nnacl/fp16/cast_fp16.h"
 #include "src/runtime/kernel/arm/fp16/common_fp16.h"
 using mindspore::kernel::KERNEL_ARCH::kCPU;
@@ -84,31 +83,17 @@
 }
 int PoolingFp16CPUKernel::Run() {
-  auto input_tensor = in_tensors_.at(kInputIndex);
-  auto in_data_type_ = input_tensor->data_type();
-  MS_ASSERT(in_data_type_ == kNumberTypeFloat32 || in_data_type_ == kNumberTypeFloat16);
-  fp16_input_ = ConvertInputFp32toFp16(input_tensor, context_);
-  auto out_tensor = out_tensors_.at(kOutputIndex);
-  auto out_data_type_ = out_tensor->data_type();
-  MS_ASSERT(out_data_type_ == kNumberTypeFloat32 || out_data_type_ == kNumberTypeFloat16);
-  fp16_output_ = MallocOutputFp16(out_tensor, context_);
+  auto input_tensor = in_tensors_.at(0);
+  auto output_tensor = out_tensors_.at(0);
+  fp16_input_ = reinterpret_cast<float16_t *>(input_tensor->data_c());
+  fp16_output_ = reinterpret_cast<float16_t *>(output_tensor->data_c());
   int error_code = ParallelLaunch(this->context_->thread_pool_, PoolingFp16Impl, this, thread_count_);
   if (error_code != RET_OK) {
     MS_LOG(ERROR) << "pooling error error_code[" << error_code << "]";
     return RET_ERROR;
   }
-  if (in_data_type_ == kNumberTypeFloat32) {
-    context_->allocator->Free(fp16_input_);
-  }
-  if (out_data_type_ == kNumberTypeFloat32) {
-    auto out_ele_num = out_tensor->ElementsNum();
-    auto output_addr = reinterpret_cast<float *>(out_tensor->MutableData());
-    Float16ToFloat32(fp16_output_, output_addr, out_ele_num);
-    context_->allocator->Free(fp16_output_);
-  }
   return RET_OK;
 }
@@ -30,39 +30,12 @@ namespace mindspore::kernel {
 int ReshapeFp16CPUKernel::Run() {
   auto in_tensor = in_tensors_.at(kInputIndex);
   auto out_tensor = out_tensors_.at(kOutputIndex);
-  auto input_ptr = in_tensor->MutableData();
-  auto output_ptr = out_tensor->MutableData();
-  size_t data_size = out_tensor->Size();
-  auto in_datatype = in_tensor->data_type();
-  auto out_datatype = out_tensor->data_type();
-  if (in_datatype != out_datatype) {
-    if (in_datatype == kNumberTypeFloat32 && out_datatype == kNumberTypeFloat16) {
-      input_ptr = context_->allocator->Malloc(in_tensor->ElementsNum() * sizeof(float16_t));
-      if (input_ptr == nullptr) {
-        MS_LOG(ERROR) << "malloc in tensor fail!";
-        return mindspore::lite::RET_MEMORY_FAILED;
-      }
-      Float32ToFloat16(reinterpret_cast<float *>(in_tensor->MutableData()), reinterpret_cast<float16_t *>(input_ptr),
-                       in_tensor->ElementsNum());
-    } else if ((in_datatype == kNumberTypeFloat16 && out_datatype == kNumberTypeFloat32)) {
-      input_ptr = context_->allocator->Malloc(in_tensor->ElementsNum() * sizeof(float));
-      if (input_ptr == nullptr) {
-        MS_LOG(ERROR) << "malloc in tensor fail!";
-        return mindspore::lite::RET_MEMORY_FAILED;
-      }
-      Float16ToFloat32(reinterpret_cast<float16_t *>(in_tensor->MutableData()), reinterpret_cast<float *>(input_ptr),
-                       in_tensor->ElementsNum());
-    } else {
-      MS_LOG(ERROR) << "unsupported data type, in_datatype: " << in_datatype << ",out_datatype: " << out_datatype;
-      return RET_ERROR;
-    }
-  }
-  Reshape(input_ptr, output_ptr, data_size);
-  if (in_datatype != out_datatype) {
-    context_->allocator->Free(input_ptr);
-  }
+  float16_t *input_ptr = reinterpret_cast<float16_t *>(in_tensor->data_c());
+  float16_t *output_ptr = reinterpret_cast<float16_t *>(out_tensor->data_c());
+  Reshape(input_ptr, output_ptr, out_tensor->Size());
   return RET_OK;
 }
@@ -16,7 +16,6 @@
 #include "src/runtime/kernel/arm/fp16/slice_fp16.h"
 #include "src/runtime/kernel/arm/fp16/common_fp16.h"
 #include "src/kernel_registry.h"
-#include "nnacl/fp16/cast_fp16.h"
 #include "nnacl/fp16/slice_fp16.h"
 using mindspore::lite::KernelRegistrar;
@@ -31,13 +30,12 @@ int SliceFp16CPUKernel::SliceParallelRun(int thread_id) {
 }
 int SliceFp16CPUKernel::Run() {
-  input_fp16_ = ConvertInputFp32toFp16(in_tensors_.at(0), context_);
-  output_fp16_ = MallocOutputFp16(out_tensors_.at(0), context_);
-  if (input_fp16_ == nullptr || output_fp16_ == nullptr) {
-    FreeInputAndOutput();
-    MS_LOG(ERROR) << "input or output is nullptr";
-    return RET_ERROR;
-  }
+  auto input_tensor = in_tensors_.at(0);
+  auto output_tensor = out_tensors_.at(0);
+  input_fp16_ = reinterpret_cast<float16_t *>(input_tensor->data_c());
+  output_fp16_ = reinterpret_cast<float16_t *>(output_tensor->data_c());
   if (param_->size_[1] < op_parameter_->thread_num_) {
     DoSliceFp16NoParallel(input_fp16_, output_fp16_, param_);
     return RET_OK;
@@ -46,24 +44,8 @@
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "slice launch fail!ret: " << ret;
   }
-  if (out_tensors_.at(0)->data_type() == kNumberTypeFloat32) {
-    Float16ToFloat32(output_fp16_, reinterpret_cast<float *>(out_tensors_.at(0)->MutableData()),
-                     out_tensors_.at(0)->ElementsNum());
-  }
-  FreeInputAndOutput();
   return ret;
 }
-void SliceFp16CPUKernel::FreeInputAndOutput() {
-  if (in_tensors_.at(0)->data_type() == kNumberTypeFloat32) {
-    context_->allocator->Free(input_fp16_);
-    input_fp16_ = nullptr;
-  }
-  if (out_tensors_.at(0)->data_type() == kNumberTypeFloat32) {
-    context_->allocator->Free(output_fp16_);
-    output_fp16_ = nullptr;
-  }
-}
 REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Slice, LiteKernelCreator<SliceFp16CPUKernel>)
 }  // namespace mindspore::kernel
@@ -32,7 +32,6 @@ class SliceFp16CPUKernel : public SliceCPUKernel {
   int SliceParallelRun(int thread_id) override;
  protected:
-  void FreeInputAndOutput();
   float16_t *input_fp16_ = nullptr;
   float16_t *output_fp16_ = nullptr;
 };
@@ -69,17 +69,6 @@ int SoftmaxFp16CPUKernel::MallocTmpBuffer() {
     return RET_ERROR;
   }
   memset(sum_data_, 0, out_plane_size * in_plane_size * sizeof(float16_t));
-  input_fp16_ = ConvertInputFp32toFp16(in_tensors_.at(kInputIndex), context_);
-  if (input_fp16_ == nullptr) {
-    MS_LOG(ERROR) << "malloc data failed";
-    return RET_ERROR;
-  }
-  output_fp16_ = MallocOutputFp16(out_tensors_.at(kOutputIndex), context_);
-  if (output_fp16_ == nullptr) {
-    MS_LOG(ERROR) << "malloc data failed";
-    return RET_ERROR;
-  }
   return RET_OK;
 }
@@ -88,19 +77,6 @@ void SoftmaxFp16CPUKernel::FreeTmpBuffer() {
     context_->allocator->Free(sum_data_);
     sum_data_ = nullptr;
   }
-  if (in_tensors_.at(kInputIndex)->data_type() == kNumberTypeFloat32) {
-    if (input_fp16_ != nullptr) {
-      context_->allocator->Free(input_fp16_);
-      input_fp16_ = nullptr;
-    }
-  }
-  if (out_tensors_.at(kOutputIndex)->data_type() == kNumberTypeFloat32) {
-    if (output_fp16_ != nullptr) {
-      context_->allocator->Free(output_fp16_);
-      output_fp16_ = nullptr;
-    }
-  }
 }
@@ -110,11 +86,15 @@ int SoftmaxFp16CPUKernel::Run() {
     MS_LOG(ERROR) << "MallocTmpBuffer failed";
     return RET_ERROR;
   }
+  auto input_tensor = in_tensors_.at(0);
+  auto output_tensor = out_tensors_.at(0);
+  input_fp16_ = reinterpret_cast<float16_t *>(input_tensor->data_c());
+  output_fp16_ = reinterpret_cast<float16_t *>(output_tensor->data_c());
   SoftmaxFp16(input_fp16_, output_fp16_, sum_data_, softmax_param_);
-  auto out_tensor = out_tensors_.at(kOutputIndex);
-  if (out_tensor->data_type() == kNumberTypeFloat32) {
-    Float16ToFloat32(output_fp16_, reinterpret_cast<float *>(out_tensor->MutableData()), out_tensor->ElementsNum());
-  }
   FreeTmpBuffer();
   return RET_OK;
 }
@@ -17,7 +17,6 @@
 #include "src/runtime/kernel/arm/fp16/common_fp16.h"
 #include "src/runtime/kernel/arm/base/split_base.h"
 #include "nnacl/fp16/split_fp16.h"
-#include "nnacl/fp16/cast_fp16.h"
 #include "nnacl/split.h"
 #include "nnacl/split_parameter.h"
 #include "src/kernel_registry.h"
@@ -31,7 +30,6 @@ using mindspore::schema::PrimitiveType_Split;
 namespace mindspore::kernel {
 int SplitFp16CPUKernel::Init() {
-
   auto ret = SplitBaseCPUKernel::Init();
   if (ret != RET_OK) {
@@ -76,45 +74,19 @@ static int SplitFp16Run(void *cdata, int task_id) {
 }
 int SplitFp16CPUKernel::Run() {
-  input_ptr_ = ConvertInputFp32toFp16(in_tensors_.at(0), context_);
-  if (input_ptr_ == nullptr) {
-    MS_LOG(ERROR) << "input or output is nullptr";
-    return RET_ERROR;
-  }
+  auto input_tensor = in_tensors_.at(0);
+  input_ptr_ = reinterpret_cast<float16_t *>(input_tensor->data_c());
   for (int i = 0; i < param->num_split_; i++) {
-    output_ptr_.at(i) = MallocOutputFp16(out_tensors_.at(i), context_);
-    if (output_ptr_.at(i) == nullptr) {
-      FreeInputAndOutput();
-      MS_LOG(ERROR) << "input or output is nullptr";
-      return RET_ERROR;
-    }
+    auto output_tensor = out_tensors_.at(i);
+    output_ptr_.at(i) = reinterpret_cast<float16_t *>(output_tensor->data_c());
   }
   auto ret = ParallelLaunch(this->context_->thread_pool_, SplitFp16Run, this, thread_n_num_);
-  for (int i = 0; i < param->num_split_; i++) {
-    if (out_tensors_.at(i)->data_type() == kNumberTypeFloat32) {
-      Float16ToFloat32(output_ptr_.at(i), reinterpret_cast<float *>(out_tensors_.at(i)->MutableData()),
-                       out_tensors_.at(i)->ElementsNum());
-    }
-  }
-  FreeInputAndOutput();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "split error error_code[" << ret << "]";
   }
   return ret;
 }
-void SplitFp16CPUKernel::FreeInputAndOutput() {
-  if (in_tensors_.at(0)->data_type() == kNumberTypeFloat32) {
-    context_->allocator->Free(input_ptr_);
-    input_ptr_ = nullptr;
-  }
-  for (int i = 0; i < param->num_split_; i++) {
-    if (out_tensors_.at(i)->data_type() == kNumberTypeFloat32) {
-      context_->allocator->Free(output_ptr_.at(i));
-      output_ptr_.at(i) = nullptr;
-    }
-  }
-}
 REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Split, LiteKernelCreator<SplitFp16CPUKernel>)
 }  // namespace mindspore::kernel
@@ -39,7 +39,6 @@ class SplitFp16CPUKernel : public SplitBaseCPUKernel {
  private:
   float16_t *input_ptr_ = nullptr;
   std::vector<float16_t *> output_ptr_;
-  void FreeInputAndOutput();
 };
 }  // namespace mindspore::kernel
@@ -41,12 +41,6 @@ int GatherCPUKernel::DoGather(int task_id) {
   auto indices_tensor = in_tensors_.at(1);
   auto out_tensor = out_tensors_.at(0);
-  auto input_ptr = reinterpret_cast<float *>(input_tensor->MutableData());
-  auto output_ptr = reinterpret_cast<float *>(out_tensor->MutableData());
-  auto input_int32 = reinterpret_cast<int32_t *>(input_tensor->MutableData());
-  auto output_int32 = reinterpret_cast<int32_t *>(out_tensor->MutableData());
   auto in_shape = input_tensor->shape();
   int in_rank = in_shape.size();
   int indices_element_size = indices_tensor->ElementsNum();
@@ -65,16 +59,15 @@
   int count = MSMIN(stride, outer_size - stride * task_id);
   auto thread_stride = stride * task_id;
-  int error_code;
-  if (input_tensor->data_type() == kNumberTypeInt32) {
-    input_int32 += thread_stride * limit;
-    output_int32 += thread_stride * indices_element_size;
-    error_code = GatherInt32(input_int32, count, inner_size, limit, indices_data_, indices_element_size, output_int32);
-  } else {
-    input_ptr += thread_stride * limit;
-    output_ptr += thread_stride * indices_element_size;
-    error_code = GatherFp32(input_ptr, count, inner_size, limit, indices_data_, indices_element_size, output_ptr);
-  }
+  int8_t *int8_in = reinterpret_cast<int8_t *>(input_tensor->data_c());
+  int8_t *int8_out = reinterpret_cast<int8_t *>(out_tensor->data_c());
+  int data_size = lite::DataTypeSize(input_tensor->data_type());
+  int8_in += thread_stride * limit * data_size;
+  int8_out += thread_stride * indices_element_size * data_size;
+  int error_code = Gather(int8_in, count, inner_size, limit, indices_data_, indices_element_size, int8_out, data_size);
   return error_code;
 }