diff --git a/mindspore/lite/nnacl/base/gather_base.c b/mindspore/lite/nnacl/base/gather_base.c
index 6791623755..c42bda8e80 100644
--- a/mindspore/lite/nnacl/base/gather_base.c
+++ b/mindspore/lite/nnacl/base/gather_base.c
@@ -15,31 +15,22 @@
  */
 #include "nnacl/base/gather_base.h"
 
-int GatherFp32(const float *input, int outer_size, int inner_size, int limit, const int *indices,
-               int indices_element_size, float *output) {
-  for (int m = 0; m < outer_size; ++m) {
-    const float *inputm = input + inner_size * m * limit;
-    float *outputm = output + inner_size * m * indices_element_size;
-    for (int i = 0; i < indices_element_size; ++i) {
-      if (indices[i] < 0 || indices[i] > limit) {
-        return NNACL_ERR;
-      }
-      memcpy(outputm + i * inner_size, inputm + indices[i] * inner_size, sizeof(float) * inner_size);
-    }
-  }
-  return NNACL_OK;
-}
-int GatherInt32(const int32_t *input, int outer_size, int inner_size, int limit, const int *indices,
-                int indices_element_size, int32_t *output) {
+int Gather(const void *input, int outer_size, int inner_size, int limit, const int *indices, int indices_element_size,
+           void *output, int data_size) {
+  const int8_t *int8_in = (int8_t *)input;
+  int8_t *int8_out = (int8_t *)output;
+
   for (int m = 0; m < outer_size; ++m) {
-    const int32_t *inputm = input + inner_size * m * limit;
-    int32_t *outputm = output + inner_size * m * indices_element_size;
+    const int8_t *int8_in_m = int8_in + inner_size * m * limit * data_size;
+    int8_t *int8_out_m = int8_out + inner_size * m * indices_element_size * data_size;
+
     for (int i = 0; i < indices_element_size; ++i) {
       if (indices[i] < 0 || indices[i] > limit) {
         return NNACL_ERR;
       }
-      memcpy(outputm + i * inner_size, inputm + indices[i] * inner_size, sizeof(int32_t) * inner_size);
+      memcpy(int8_out_m + i * inner_size * data_size, int8_in_m + indices[i] * inner_size * data_size,
+             data_size * inner_size);
     }
   }
   return NNACL_OK;
 }
diff --git a/mindspore/lite/nnacl/base/gather_base.h b/mindspore/lite/nnacl/base/gather_base.h
index f38ed951c0..1c3eb0c9b1 100644
--- a/mindspore/lite/nnacl/base/gather_base.h
+++ b/mindspore/lite/nnacl/base/gather_base.h
@@ -24,10 +24,8 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
-int GatherFp32(const float *input, int outer_size, int inner_size, int limit, const int *indices,
-               int indices_element_size, float *output);
-int GatherInt32(const int32_t *input, int outer_size, int inner_size, int limit, const int *indices,
-                int indices_element_size, int32_t *output);
+int Gather(const void *input, int outer_size, int inner_size, int limit, const int *indices, int indices_element_size,
+           void *output, int data_size);
#ifdef __cplusplus
}
#endif
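
NOTE: the two type-specific kernels collapse into a single byte-wise Gather, so
callers pass the element size instead of picking an overload. A minimal usage
sketch (hypothetical shapes, not part of the patch):

    #include "nnacl/base/gather_base.h"

    /* Gather rows 1 and 3 along axis 1 of a [2, 4, 3] fp32 tensor:
       outer_size = 2, inner_size = 3, limit = 4. The identical call with
       data_size = sizeof(int32_t), or 2 for fp16, serves any element type,
       since the kernel only moves bytes. */
    int GatherExample(void) {
      float in[2 * 4 * 3] = {0}; /* input laid out as [outer][limit][inner] */
      float out[2 * 2 * 3];      /* output laid out as [outer][indices][inner] */
      int idx[2] = {1, 3};
      return Gather(in, 2, 3, 4, idx, 2, out, (int)sizeof(float));
    }
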
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/activation_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/activation_fp16.cc
index 9944b630fb..e1372fc2e0 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/activation_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/activation_fp16.cc
@@ -20,7 +20,6 @@
 #include "src/runtime/runtime_api.h"
 #include "include/errorcode.h"
 #include "src/runtime/kernel/arm/fp16/common_fp16.h"
-#include "nnacl/fp16/cast_fp16.h"
 
 using mindspore::kernel::KERNEL_ARCH::kCPU;
 using mindspore::lite::KernelRegistrar;
@@ -47,35 +46,6 @@ int ActivationFp16CPUKernel::Init() {
 
 int ActivationFp16CPUKernel::ReSize() { return RET_OK; }
 
-int ActivationFp16CPUKernel::MallocTmpBuffer() {
-  fp16_input_ = ConvertInputFp32toFp16(in_tensors_.at(0), context_);
-  if (fp16_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc data failed";
-    return RET_ERROR;
-  }
-  fp16_output_ = MallocOutputFp16(out_tensors_.at(0), context_);
-  if (fp16_output_ == nullptr) {
-    MS_LOG(ERROR) << "malloc data failed";
-    return RET_ERROR;
-  }
-  return RET_OK;
-}
-
-void ActivationFp16CPUKernel::FreeTmpBuffer() {
-  if (in_tensors_.at(0)->data_type() == kNumberTypeFloat32) {
-    if (fp16_input_ != nullptr) {
-      context_->allocator->Free(fp16_input_);
-      fp16_input_ = nullptr;
-    }
-  }
-  if (out_tensors_.at(0)->data_type() == kNumberTypeFloat32) {
-    if (fp16_output_ != nullptr) {
-      context_->allocator->Free(fp16_output_);
-      fp16_output_ = nullptr;
-    }
-  }
-}
-
 int ActivationFp16CPUKernel::DoActivation(int task_id) {
   auto length = in_tensors_.at(0)->ElementsNum();
 
@@ -115,24 +85,18 @@ int ActivationFp16Run(void *cdata, int task_id) {
 }
 
 int ActivationFp16CPUKernel::Run() {
-  auto ret = MallocTmpBuffer();
-  if (ret != RET_OK) {
-    FreeTmpBuffer();
-    return ret;
-  }
+  auto input_tensor = in_tensors_.at(0);
+  auto output_tensor = out_tensors_.at(0);
+
+  fp16_input_ = reinterpret_cast<float16_t *>(input_tensor->data_c());
+  fp16_output_ = reinterpret_cast<float16_t *>(output_tensor->data_c());
 
   int error_code = ParallelLaunch(this->context_->thread_pool_, ActivationFp16Run, this, thread_count_);
   if (error_code != RET_OK) {
     MS_LOG(ERROR) << "Activation function error error_code[" << error_code << "]";
-    FreeTmpBuffer();
     return RET_ERROR;
   }
 
-  auto out_tensor = out_tensors_.at(0);
-  if (out_tensor->data_type() == kNumberTypeFloat32) {
-    Float16ToFloat32(fp16_output_, reinterpret_cast<float *>(out_tensor->MutableData()), out_tensor->ElementsNum());
-  }
-  FreeTmpBuffer();
   return RET_OK;
 }
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/activation_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/activation_fp16.h
index 902091d7be..5102ba30a9 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/activation_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/activation_fp16.h
@@ -37,8 +37,6 @@ class ActivationFp16CPUKernel : public LiteKernel {
   int ReSize() override;
   int Run() override;
   int DoActivation(int task_id);
-  int MallocTmpBuffer();
-  void FreeTmpBuffer();
 
  private:
   int thread_count_;
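
NOTE: this is the first instance of a pattern repeated in every fp16 kernel in
this patch (arithmetic, crop, pad, pooling, reshape, slice, softmax, split):
the fp32<->fp16 staging buffers and the post-run Float16ToFloat32 copies are
gone, and Run() reads and writes tensor memory directly through data_c(). That
is only safe if the runtime now guarantees both tensors already hold fp16 data.
A defensive guard, sketched here as an assumption rather than part of the
patch, would be:

    // Hypothetical check at the top of Run(), using only existing APIs.
    if (in_tensors_.at(0)->data_type() != kNumberTypeFloat16 ||
        out_tensors_.at(0)->data_type() != kNumberTypeFloat16) {
      MS_LOG(ERROR) << "ActivationFp16 expects fp16 input and output tensors";
      return RET_ERROR;
    }
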
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/arithmetic_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/arithmetic_fp16.cc
index b56a581b7e..888bb8b91d 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/arithmetic_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/arithmetic_fp16.cc
@@ -209,6 +209,7 @@ int ArithmeticFP16CPUKernel::Run() {
     FreeTmpBuffer();
     return RET_ERROR;
   }
+
   auto ret = ParallelLaunch(this->context_->thread_pool_, ArithmeticsRunFp16, this, context_->thread_num_);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "ArithmeticsRunFp16 run error error_code[" << ret << "]";
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/arithmetic_self_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/arithmetic_self_fp16.cc
index c05f6c46c2..7140439421 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/arithmetic_self_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/arithmetic_self_fp16.cc
@@ -16,7 +16,6 @@
 #include "src/runtime/kernel/arm/fp16/arithmetic_self_fp16.h"
 #include "src/runtime/kernel/arm/fp16/common_fp16.h"
 #include "src/kernel_registry.h"
-#include "nnacl/fp16/cast_fp16.h"
 #include "nnacl/fp16/arithmetic_self_fp16.h"
 
 using mindspore::lite::KernelRegistrar;
@@ -72,36 +71,17 @@ int ArithmeticSelfFp16CPUKernel::DoExecute(int task_id) {
   return ret;
 }
 
-void ArithmeticSelfFp16CPUKernel::FreeInputAndOutput() {
-  if (in_tensors_.at(0)->data_type() == kNumberTypeFloat32) {
-    context_->allocator->Free(input_fp16_ptr_);
-    input_fp16_ptr_ = nullptr;
-  }
-  if (out_tensors_.at(0)->data_type() == kNumberTypeFloat32) {
-    context_->allocator->Free(output_fp16_ptr_);
-    output_fp16_ptr_ = nullptr;
-  }
-}
-
 int ArithmeticSelfFp16CPUKernel::Run() {
   auto input_tensor = in_tensors_.at(0);
   auto output_tensor = out_tensors_.at(0);
-  input_fp16_ptr_ = ConvertInputFp32toFp16(input_tensor, context_);
-  output_fp16_ptr_ = MallocOutputFp16(output_tensor, context_);
-  if (input_fp16_ptr_ == nullptr || output_fp16_ptr_ == nullptr) {
-    FreeInputAndOutput();
-    MS_LOG(ERROR) << "input or output is nullptr";
-    return RET_ERROR;
-  }
+
+  input_fp16_ptr_ = reinterpret_cast<float16_t *>(input_tensor->data_c());
+  output_fp16_ptr_ = reinterpret_cast<float16_t *>(output_tensor->data_c());
+
   auto ret = ParallelLaunch(this->context_->thread_pool_, ArithmeticSelfRun, this, op_parameter_->thread_num_);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "ArithmeticSelfRun error error_code[" << ret << "]";
   }
-  if (out_tensors_.at(0)->data_type() == kNumberTypeFloat32) {
-    Float16ToFloat32(output_fp16_ptr_, reinterpret_cast<float *>(output_tensor->MutableData()),
-                     output_tensor->ElementsNum());
-  }
-  FreeInputAndOutput();
   return ret;
 }
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/arithmetic_self_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/arithmetic_self_fp16.h
index 660c7fcde1..f30bfe0734 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/arithmetic_self_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/arithmetic_self_fp16.h
@@ -35,7 +35,6 @@ class ArithmeticSelfFp16CPUKernel : public ArithmeticSelfCPUKernel {
   int DoExecute(int task_id) override;
 
  private:
-  void FreeInputAndOutput();
   ArithmeticSelfFp16Func GetArithmeticSelfFp16Fun(int primitive_type);
   ArithmeticSelfFp16Func fp16_func_ = nullptr;
   float16_t *input_fp16_ptr_ = nullptr;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/crop_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/crop_fp16.cc
index d119aa8f31..626e0424b7 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/crop_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/crop_fp16.cc
@@ -49,37 +49,18 @@ static int CropFp16Run(void *cdata, int task_id) {
 }
 
 int CropFp16CPUKernel::Run() {
-  input_ptr_ = ConvertInputFp32toFp16(in_tensors_.at(kInputIndex), context_);
-  output_ptr_ = MallocOutputFp16(out_tensors_.at(kOutputIndex), context_);
-  if (input_ptr_ == nullptr || output_ptr_ == nullptr) {
-    MS_LOG(ERROR) << "input or output is nullptr";
-    FreeInputAndOutput();
-    return RET_ERROR;
-  }
+  auto input_tensor = in_tensors_.at(0);
+  auto output_tensor = out_tensors_.at(0);
+
+  input_ptr_ = reinterpret_cast<float16_t *>(input_tensor->data_c());
+  output_ptr_ = reinterpret_cast<float16_t *>(output_tensor->data_c());
 
   auto ret = ParallelLaunch(this->context_->thread_pool_, CropFp16Run, this, crop_para_->thread_count_);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "ParallelLaunch failed: " << ret;
-    FreeInputAndOutput();
-  }
-  if (out_tensors_.at(kOutputIndex)->data_type() == kNumberTypeFloat32) {
-    Float16ToFloat32(output_ptr_, reinterpret_cast<float *>(out_tensors_.at(kOutputIndex)->data_c()),
-                     out_tensors_.at(kOutputIndex)->ElementsNum());
   }
-  FreeInputAndOutput();
   return ret;
 }
 
-void CropFp16CPUKernel::FreeInputAndOutput() {
-  if (in_tensors_.at(kInputIndex)->data_type() == kNumberTypeFloat32) {
-    context_->allocator->Free(input_ptr_);
-    input_ptr_ = nullptr;
-  }
-  if (out_tensors_.at(kOutputIndex)->data_type() == kNumberTypeFloat32) {
-    context_->allocator->Free(output_ptr_);
-    output_ptr_ = nullptr;
-  }
-}
-
 REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Crop, LiteKernelCreator<CropFp16CPUKernel>)
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/crop_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/crop_fp16.h
index 4e925f84e4..f55cddf0bf 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/crop_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/crop_fp16.h
@@ -21,7 +21,6 @@
 #include <arm_neon.h>
 #include "include/errorcode.h"
 #include "nnacl/crop_parameter.h"
-#include "nnacl/fp16/cast_fp16.h"
 #include "nnacl/fp16/crop_fp16.h"
 #include "src/lite_kernel.h"
 #include "src/runtime/kernel/arm/base/crop_base.h"
@@ -44,7 +43,6 @@ class CropFp16CPUKernel : public CropBaseCPUKernel {
  private:
   float16_t *input_ptr_ = nullptr;
   float16_t *output_ptr_ = nullptr;
-  void FreeInputAndOutput();
 };
 
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/pad_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/pad_fp16.cc
index a694043d58..8fb4c8d5e5 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/pad_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/pad_fp16.cc
@@ -16,7 +16,6 @@
 
 #include "src/runtime/kernel/arm/fp16/pad_fp16.h"
 #include "src/runtime/kernel/arm/fp16/common_fp16.h"
-#include "nnacl/fp16/cast_fp16.h"
 #include "src/kernel_registry.h"
 #include "src/runtime/runtime_api.h"
 
@@ -43,16 +42,10 @@ int PadFp16CPUKernel::RunMirrorPadImpl(int task_id) {
 int PadFp16CPUKernel::Run() {
   auto input_tensor = in_tensors_.at(0);
   auto output_tensor = out_tensors_.at(0);
-  is_input_fp32_ = input_tensor->data_type() == kNumberTypeFloat32;
-  is_output_fp32_ = output_tensor->data_type() == kNumberTypeFloat32;
 
-  input_ = ConvertInputFp32toFp16(input_tensor, context_);
-  output_ = MallocOutputFp16(output_tensor, context_);
-  if (input_ == nullptr || output_ == nullptr) {
-    FreeInputAndOutput();
-    MS_LOG(ERROR) << "input or output is nullptr";
-    return RET_ERROR;
-  }
+  input_ = reinterpret_cast<float16_t *>(input_tensor->data_c());
+  output_ = reinterpret_cast<float16_t *>(output_tensor->data_c());
+
   int ret = 0;
   if (pad_param_->pad_mode_ == static_cast<int>(schema::PaddingMode_CONSTANT)) {
     if (pad_param_->constant_value_ - 0.0f < 1e-5) {
@@ -73,22 +66,8 @@ int PadFp16CPUKernel::Run() {
       MS_LOG(ERROR) << "Pad Reflect or Symmetric mode run error, error_code[" << ret << "]";
     }
   }
-  if (is_output_fp32_) {
-    Float16ToFloat32(output_, reinterpret_cast<float *>(output_tensor->MutableData()), output_tensor->ElementsNum());
-  }
-  FreeInputAndOutput();
-  return ret;
-}
-
-void PadFp16CPUKernel::FreeInputAndOutput() {
-  if (is_input_fp32_) {
-    context_->allocator->Free(input_);
-    input_ = nullptr;
-  }
-  if (is_output_fp32_) {
-    context_->allocator->Free(output_);
-    output_ = nullptr;
-  }
+
+  return ret;
 }
 
 REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Pad, LiteKernelCreator<PadFp16CPUKernel>)
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/pad_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/pad_fp16.h
index 8a906644cc..660fd0415b 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/pad_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/pad_fp16.h
@@ -35,9 +35,6 @@ class PadFp16CPUKernel : public PadCPUKernel {
   int RunMirrorPadImpl(int task_id) override;
 
  private:
-  void FreeInputAndOutput();
-  bool is_input_fp32_ = false;
-  bool is_output_fp32_ = false;
   float16_t *input_ = nullptr;
   float16_t *output_ = nullptr;
 };
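
NOTE: the zero-fill fast path kept as context above tests
pad_param_->constant_value_ - 0.0f < 1e-5, which also matches any negative
constant. If the intent is "constant is approximately zero", an absolute-value
test states it precisely (sketch, outside this patch):

    if (fabsf(pad_param_->constant_value_) < 1e-5f) {
      // zero-fill fast path
    }
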
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/pooling_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/pooling_fp16.cc
index a34869d84d..db2a68666b 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/pooling_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/pooling_fp16.cc
@@ -20,7 +20,6 @@
 #include "src/runtime/runtime_api.h"
 #include "include/errorcode.h"
 #include "nnacl/op_base.h"
-#include "nnacl/fp16/cast_fp16.h"
 #include "src/runtime/kernel/arm/fp16/common_fp16.h"
 
 using mindspore::kernel::KERNEL_ARCH::kCPU;
@@ -84,31 +83,17 @@ static int PoolingFp16Impl(void *cdata, int task_id) {
 }
 
 int PoolingFp16CPUKernel::Run() {
-  auto input_tensor = in_tensors_.at(kInputIndex);
-  auto in_data_type_ = input_tensor->data_type();
-  MS_ASSERT(in_data_type_ == kNumberTypeFloat32 || in_data_type_ == kNumberTypeFloat16);
-  fp16_input_ = ConvertInputFp32toFp16(input_tensor, context_);
+  auto input_tensor = in_tensors_.at(0);
+  auto output_tensor = out_tensors_.at(0);
 
-  auto out_tensor = out_tensors_.at(kOutputIndex);
-  auto out_data_type_ = out_tensor->data_type();
-  MS_ASSERT(out_data_type_ == kNumberTypeFloat32 || out_data_type_ == kNumberTypeFloat16);
-  fp16_output_ = MallocOutputFp16(out_tensor, context_);
+  fp16_input_ = reinterpret_cast<float16_t *>(input_tensor->data_c());
+  fp16_output_ = reinterpret_cast<float16_t *>(output_tensor->data_c());
 
   int error_code = ParallelLaunch(this->context_->thread_pool_, PoolingFp16Impl, this, thread_count_);
   if (error_code != RET_OK) {
     MS_LOG(ERROR) << "pooling error error_code[" << error_code << "]";
     return RET_ERROR;
   }
-
-  if (in_data_type_ == kNumberTypeFloat32) {
-    context_->allocator->Free(fp16_input_);
-  }
-  if (out_data_type_ == kNumberTypeFloat32) {
-    auto out_ele_num = out_tensor->ElementsNum();
-    auto output_addr = reinterpret_cast<float *>(out_tensor->MutableData());
-    Float16ToFloat32(fp16_output_, output_addr, out_ele_num);
-    context_->allocator->Free(fp16_output_);
-  }
   return RET_OK;
 }
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/reshape_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/reshape_fp16.cc
index 7d01aa4159..42d280768d 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/reshape_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/reshape_fp16.cc
@@ -30,39 +30,12 @@ namespace mindspore::kernel {
 int ReshapeFp16CPUKernel::Run() {
   auto in_tensor = in_tensors_.at(kInputIndex);
   auto out_tensor = out_tensors_.at(kOutputIndex);
-  auto input_ptr = in_tensor->MutableData();
-  auto output_ptr = out_tensor->MutableData();
-  size_t data_size = out_tensor->Size();
-  auto in_datatype = in_tensor->data_type();
-  auto out_datatype = out_tensor->data_type();
-  if (in_datatype != out_datatype) {
-    if (in_datatype == kNumberTypeFloat32 && out_datatype == kNumberTypeFloat16) {
-      input_ptr = context_->allocator->Malloc(in_tensor->ElementsNum() * sizeof(float16_t));
-      if (input_ptr == nullptr) {
-        MS_LOG(ERROR) << "malloc in tensor fail!";
-        return mindspore::lite::RET_MEMORY_FAILED;
-      }
-      Float32ToFloat16(reinterpret_cast<float *>(in_tensor->MutableData()), reinterpret_cast<float16_t *>(input_ptr),
-                       in_tensor->ElementsNum());
-    } else if ((in_datatype == kNumberTypeFloat16 && out_datatype == kNumberTypeFloat32)) {
-      input_ptr = context_->allocator->Malloc(in_tensor->ElementsNum() * sizeof(float));
-      if (input_ptr == nullptr) {
-        MS_LOG(ERROR) << "malloc in tensor fail!";
-        return mindspore::lite::RET_MEMORY_FAILED;
-      }
-      Float16ToFloat32(reinterpret_cast<float16_t *>(in_tensor->MutableData()), reinterpret_cast<float *>(input_ptr),
-                       in_tensor->ElementsNum());
-    } else {
-      MS_LOG(ERROR) << "unsupported data type, in_datatype: " << in_datatype << ",out_datatype: " << out_datatype;
-      return RET_ERROR;
-    }
-  }
+  float16_t *input_ptr = reinterpret_cast<float16_t *>(in_tensor->data_c());
+  float16_t *output_ptr = reinterpret_cast<float16_t *>(out_tensor->data_c());
+
+  Reshape(input_ptr, output_ptr, out_tensor->Size());
 
-  Reshape(input_ptr, output_ptr, data_size);
-  if (in_datatype != out_datatype) {
-    context_->allocator->Free(input_ptr);
-  }
   return RET_OK;
 }
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/slice_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/slice_fp16.cc
index fea556df97..1e5da3180c 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/slice_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/slice_fp16.cc
@@ -16,7 +16,6 @@
 #include "src/runtime/kernel/arm/fp16/slice_fp16.h"
 #include "src/runtime/kernel/arm/fp16/common_fp16.h"
 #include "src/kernel_registry.h"
-#include "nnacl/fp16/cast_fp16.h"
 #include "nnacl/fp16/slice_fp16.h"
 
 using mindspore::lite::KernelRegistrar;
@@ -31,13 +30,12 @@ int SliceFp16CPUKernel::SliceParallelRun(int thread_id) {
 }
 
 int SliceFp16CPUKernel::Run() {
-  input_fp16_ = ConvertInputFp32toFp16(in_tensors_.at(0), context_);
-  output_fp16_ = MallocOutputFp16(out_tensors_.at(0), context_);
-  if (input_fp16_ == nullptr || output_fp16_ == nullptr) {
-    FreeInputAndOutput();
-    MS_LOG(ERROR) << "input or output is nullptr";
-    return RET_ERROR;
-  }
+  auto input_tensor = in_tensors_.at(0);
+  auto output_tensor = out_tensors_.at(0);
+
+  input_fp16_ = reinterpret_cast<float16_t *>(input_tensor->data_c());
+  output_fp16_ = reinterpret_cast<float16_t *>(output_tensor->data_c());
+
   if (param_->size_[1] < op_parameter_->thread_num_) {
     DoSliceFp16NoParallel(input_fp16_, output_fp16_, param_);
     return RET_OK;
@@ -46,24 +44,8 @@ int SliceFp16CPUKernel::Run() {
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "slice launch fail!ret: " << ret;
   }
-  if (out_tensors_.at(0)->data_type() == kNumberTypeFloat32) {
-    Float16ToFloat32(output_fp16_, reinterpret_cast<float *>(out_tensors_.at(0)->MutableData()),
-                     out_tensors_.at(0)->ElementsNum());
-  }
-  FreeInputAndOutput();
   return ret;
 }
 
-void SliceFp16CPUKernel::FreeInputAndOutput() {
-  if (in_tensors_.at(0)->data_type() == kNumberTypeFloat32) {
-    context_->allocator->Free(input_fp16_);
-    input_fp16_ = nullptr;
-  }
-  if (out_tensors_.at(0)->data_type() == kNumberTypeFloat32) {
-    context_->allocator->Free(output_fp16_);
-    output_fp16_ = nullptr;
-  }
-}
-
 REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Slice, LiteKernelCreator<SliceFp16CPUKernel>)
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/slice_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/slice_fp16.h
index 3c1b200416..097db1ad4e 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/slice_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/slice_fp16.h
@@ -32,7 +32,6 @@ class SliceFp16CPUKernel : public SliceCPUKernel {
   int SliceParallelRun(int thread_id) override;
 
  protected:
-  void FreeInputAndOutput();
   float16_t *input_fp16_ = nullptr;
   float16_t *output_fp16_ = nullptr;
 };
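
NOTE on the reshape and slice changes above: both now assume matching fp16
dtypes on each side. For reshape in particular, the old conversion ladder
reduces to a single byte copy; under that assumption Reshape(input_ptr,
output_ptr, out_tensor->Size()) is equivalent to (sketch, not part of the
patch):

    // out_tensor->Size() is the output buffer size in bytes, i.e.
    // out_tensor->ElementsNum() * sizeof(float16_t) for an fp16 tensor.
    memcpy(output_ptr, input_ptr, out_tensor->Size());
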
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/softmax_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/softmax_fp16.cc
index 06e468fcd5..98fd9cb1ee 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/softmax_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/softmax_fp16.cc
@@ -69,17 +69,6 @@ int SoftmaxFp16CPUKernel::MallocTmpBuffer() {
     return RET_ERROR;
   }
   memset(sum_data_, 0, out_plane_size * in_plane_size * sizeof(float16_t));
-
-  input_fp16_ = ConvertInputFp32toFp16(in_tensors_.at(kInputIndex), context_);
-  if (input_fp16_ == nullptr) {
-    MS_LOG(ERROR) << "malloc data failed";
-    return RET_ERROR;
-  }
-  output_fp16_ = MallocOutputFp16(out_tensors_.at(kOutputIndex), context_);
-  if (output_fp16_ == nullptr) {
-    MS_LOG(ERROR) << "malloc data failed";
-    return RET_ERROR;
-  }
   return RET_OK;
 }
 
@@ -88,19 +77,6 @@ void SoftmaxFp16CPUKernel::FreeTmpBuffer() {
     context_->allocator->Free(sum_data_);
     sum_data_ = nullptr;
   }
-  if (in_tensors_.at(kInputIndex)->data_type() == kNumberTypeFloat32) {
-    if (input_fp16_ != nullptr) {
-      context_->allocator->Free(input_fp16_);
-      input_fp16_ = nullptr;
-    }
-  }
-
-  if (out_tensors_.at(kOutputIndex)->data_type() == kNumberTypeFloat32) {
-    if (output_fp16_ != nullptr) {
-      context_->allocator->Free(output_fp16_);
-      output_fp16_ = nullptr;
-    }
-  }
 }
 
 int SoftmaxFp16CPUKernel::Run() {
@@ -110,11 +86,15 @@ int SoftmaxFp16CPUKernel::Run() {
     MS_LOG(ERROR) << "MallocTmpBuffer failed";
     return RET_ERROR;
   }
+
+  auto input_tensor = in_tensors_.at(0);
+  auto output_tensor = out_tensors_.at(0);
+
+  input_fp16_ = reinterpret_cast<float16_t *>(input_tensor->data_c());
+  output_fp16_ = reinterpret_cast<float16_t *>(output_tensor->data_c());
+
   SoftmaxFp16(input_fp16_, output_fp16_, sum_data_, softmax_param_);
-  auto out_tensor = out_tensors_.at(kOutputIndex);
-  if (out_tensor->data_type() == kNumberTypeFloat32) {
-    Float16ToFloat32(output_fp16_, reinterpret_cast<float *>(out_tensor->MutableData()), out_tensor->ElementsNum());
-  }
+
   FreeTmpBuffer();
   return RET_OK;
 }
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/split_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/split_fp16.cc
index 7722e3f28e..e982582851 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/split_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/split_fp16.cc
@@ -17,7 +17,6 @@
 #include "src/runtime/kernel/arm/fp16/common_fp16.h"
 #include "src/runtime/kernel/arm/base/split_base.h"
 #include "nnacl/fp16/split_fp16.h"
-#include "nnacl/fp16/cast_fp16.h"
 #include "nnacl/split.h"
 #include "nnacl/split_parameter.h"
 #include "src/kernel_registry.h"
@@ -31,7 +30,6 @@ using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_Split;
 
 namespace mindspore::kernel {
-
 int SplitFp16CPUKernel::Init() {
   auto ret = SplitBaseCPUKernel::Init();
   if (ret != RET_OK) {
@@ -76,45 +74,19 @@ static int SplitFp16Run(void *cdata, int task_id) {
 }
 
 int SplitFp16CPUKernel::Run() {
-  input_ptr_ = ConvertInputFp32toFp16(in_tensors_.at(0), context_);
-  if (input_ptr_ == nullptr) {
-    MS_LOG(ERROR) << "input or output is nullptr";
-    return RET_ERROR;
-  }
+  auto input_tensor = in_tensors_.at(0);
+  input_ptr_ = reinterpret_cast<float16_t *>(input_tensor->data_c());
+
   for (int i = 0; i < param->num_split_; i++) {
-    output_ptr_.at(i) = MallocOutputFp16(out_tensors_.at(i), context_);
-    if (output_ptr_.at(i) == nullptr) {
-      FreeInputAndOutput();
-      MS_LOG(ERROR) << "input or output is nullptr";
-      return RET_ERROR;
-    }
+    auto output_tensor = out_tensors_.at(i);
+    output_ptr_.at(i) = reinterpret_cast<float16_t *>(output_tensor->data_c());
   }
+
   auto ret = ParallelLaunch(this->context_->thread_pool_, SplitFp16Run, this, thread_n_num_);
-  for (int i = 0; i < param->num_split_; i++) {
-    if (out_tensors_.at(i)->data_type() == kNumberTypeFloat32) {
-      Float16ToFloat32(output_ptr_.at(i), reinterpret_cast<float *>(out_tensors_.at(i)->MutableData()),
-                       out_tensors_.at(i)->ElementsNum());
-    }
-  }
-  FreeInputAndOutput();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "split error error_code[" << ret << "]";
   }
   return ret;
 }
-
-void SplitFp16CPUKernel::FreeInputAndOutput() {
-  if (in_tensors_.at(0)->data_type() == kNumberTypeFloat32) {
-    context_->allocator->Free(input_ptr_);
-    input_ptr_ = nullptr;
-  }
-  for (int i = 0; i < param->num_split_; i++) {
-    if (out_tensors_.at(i)->data_type() == kNumberTypeFloat32) {
-      context_->allocator->Free(output_ptr_.at(i));
-      output_ptr_.at(i) = nullptr;
-    }
-  }
-}
-
 REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Split, LiteKernelCreator<SplitFp16CPUKernel>)
 }  // namespace mindspore::kernel
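
NOTE: output_ptr_.at(i) in SplitFp16CPUKernel::Run() requires the vector to
have been sized to num_split_ beforehand (presumably in Init()/ReSize(), which
this hunk does not show); .at() throws std::out_of_range instead of silently
overrunning if that invariant breaks. The assumed setup is one line:

    output_ptr_.resize(param->num_split_);  // hypothetical placement in Init()
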
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/split_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/split_fp16.h
index e10bbcea60..f34c2d0ab3 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/split_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/split_fp16.h
@@ -39,7 +39,6 @@ class SplitFp16CPUKernel : public SplitBaseCPUKernel {
  private:
   float16_t *input_ptr_ = nullptr;
   std::vector<float16_t *> output_ptr_;
-  void FreeInputAndOutput();
 };
 
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/gather_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/gather_fp32.cc
index 22a191bb31..7df3f72a83 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/gather_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/gather_fp32.cc
@@ -41,12 +41,6 @@ int GatherCPUKernel::DoGather(int task_id) {
   auto indices_tensor = in_tensors_.at(1);
   auto out_tensor = out_tensors_.at(0);
 
-  auto input_ptr = reinterpret_cast<float *>(input_tensor->MutableData());
-  auto output_ptr = reinterpret_cast<float *>(out_tensor->MutableData());
-
-  auto input_int32 = reinterpret_cast<int32_t *>(input_tensor->MutableData());
-  auto output_int32 = reinterpret_cast<int32_t *>(out_tensor->MutableData());
-
   auto in_shape = input_tensor->shape();
   int in_rank = in_shape.size();
   int indices_element_size = indices_tensor->ElementsNum();
@@ -65,16 +59,15 @@ int GatherCPUKernel::DoGather(int task_id) {
   int count = MSMIN(stride, outer_size - stride * task_id);
   auto thread_stride = stride * task_id;
 
-  int error_code;
-  if (input_tensor->data_type() == kNumberTypeInt32) {
-    input_int32 += thread_stride * limit;
-    output_int32 += thread_stride * indices_element_size;
-    error_code = GatherInt32(input_int32, count, inner_size, limit, indices_data_, indices_element_size, output_int32);
-  } else {
-    input_ptr += thread_stride * limit;
-    output_ptr += thread_stride * indices_element_size;
-    error_code = GatherFp32(input_ptr, count, inner_size, limit, indices_data_, indices_element_size, output_ptr);
-  }
+  int8_t *int8_in = reinterpret_cast<int8_t *>(input_tensor->data_c());
+  int8_t *int8_out = reinterpret_cast<int8_t *>(out_tensor->data_c());
+
+  int data_size = lite::DataTypeSize(input_tensor->data_type());
+  int8_in += thread_stride * limit * data_size;
+  int8_out += thread_stride * indices_element_size * data_size;
+
+  int error_code = Gather(int8_in, count, inner_size, limit, indices_data_, indices_element_size, int8_out, data_size);
+
   return error_code;
 }
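
NOTE: the caller-side thread split is unchanged; only the offsets are now
expressed in bytes via data_size = lite::DataTypeSize(...). A worked example
of the task math, with illustrative values and assuming stride comes from
UP_DIV(outer_size, thread_num) as in the surrounding code:

    /* outer_size = 6 outer slices, 4 threads:
         stride = UP_DIV(6, 4) = 2
         task 0: thread_stride = 0, count = MSMIN(2, 6 - 0) = 2
         task 2: thread_stride = 4, count = MSMIN(2, 6 - 4) = 2
         task 3: thread_stride = 6, count = MSMIN(2, 6 - 6) = 0 (no work)
       Each task then starts at byte offset
         thread_stride * limit * data_size
       into the input, e.g. 4 * limit * 4 bytes for fp32 on task 2. */
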